In [11]:
import os
import fitz
import re

from ollama import Client
import faiss
import pandas as pd
import numpy as np
import Stemmer
from tqdm import tqdm
import gradio as gr

from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.retrievers.bm25 import BM25Retriever

# Connect to Ollama Server

In [12]:
# Client for the Ollama server expected at localhost:11434; the cells below use it
# for both embeddings (client.embeddings) and text generation (client.generate).
client = Client(
  host='http://localhost:11434',
)

# Ingestion

In [13]:
# Path to the dataset folder
# NOTE(review): absolute, machine-specific path; anyone else running this notebook
# has to point it at their own copy of the PDF folder.
DATASET_PATH = '/mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2'

def extract_text_from_pdfs(folder_path):
    """
    Walk ``folder_path`` recursively and extract the text of every PDF page.

    Pages whose extracted text is empty or whitespace-only are skipped and a
    warning is printed for each of them.

    :param folder_path: Root directory that is searched for PDF files.
    :return: ``(texts, metadata)``, two parallel lists with one entry per
        non-empty page. Each metadata dict holds ``source`` (full path),
        ``folder`` (name of the containing directory), ``title`` (file name)
        and ``page`` (1-based page number).
    """
    texts = []
    metadata = []

    for root, _, files in os.walk(folder_path):
        folder_name = os.path.basename(root)
        for file in files:
            # Compare the extension case-insensitively so "SCAN.PDF" is not silently ignored.
            if not file.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, file)
            print(f"Extracting text from {pdf_path}...")

            # The context manager closes the document (and its file handle) even if
            # a page fails to extract; previously every opened PDF stayed open.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text().strip()
                    if text:
                        texts.append(text)
                        metadata.append({
                            "source": pdf_path,
                            "folder": folder_name,
                            "title": file,
                            "page": page_num
                        })
                    else:
                        print(f"WARNING: {file} page {page_num} not processed...")
    return texts, metadata

In [14]:
# Run the extraction; `docs` and `metadatas` are parallel lists with one entry per non-empty page.
docs, metadatas = extract_text_from_pdfs(DATASET_PATH)

Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-INITIAL-REG.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-MAIRDOE-NEW.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SETTLEMENT.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/1-CC2024-SP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-DL-CC-P-NP.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-MV-CONDUCTION-STICKER.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/10-CC2024-RELEASING.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-DL-ENHANCE.pdf...
Extracting text from /mnt/c/Users/Jeryl Salas/Documents/AI 351/Project/Dataset2/11-CC2024-MV-CONDUCTION-VERIFICATION.pdf...
Extracting t

In [15]:
# Pair every page's text with its metadata dict and wrap the pair in a Document.
documents = [
    Document(text=page_text, metadata=page_meta)
    for page_text, page_meta in zip(docs, metadatas)
]

# Cut the pages into 512-token chunks that overlap by 20 tokens, splitting on spaces.
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=20, separator=" ")
nodes = splitter.get_nodes_from_documents(documents)

# Embedding and Retrieval

## Dense via FAISS

In [16]:
def generate_embeddings(nodes, client, model):
    """
    Attach an embedding to every node, in place.

    Each node's ``text`` is sent to ``client.embeddings`` and the ``"embedding"``
    field of the reply is stored on ``node.embedding``. Progress is shown with tqdm.

    :param nodes: Iterable of nodes exposing ``text`` and a writable ``embedding``.
    :param client: Client object providing ``embeddings(prompt=..., model=...)``.
    :param model: Name of the embedding model to request.
    :return: The same ``nodes`` object that was passed in.
    """
    for node in tqdm(nodes):
        node.embedding = client.embeddings(prompt=node.text, model=model)["embedding"]
    return nodes

In [17]:
class FaissIndexer:
    """
    Faiss-based indexer for efficient similarity search using inner-product (cosine) similarity.

    Embeddings are L2-normalised before they are added, so the inner product
    computed by the index equals the cosine similarity.

    :ivar faiss_index: The FAISS index for storing and querying embeddings.
    :vartype faiss_index: faiss.IndexFlatIP
    :ivar embedding_dim: Dimensionality of the embeddings.
    :vartype embedding_dim: int
    """

    def __init__(self):
        """
        Initialize the FaissIndexer class.

        :ivar faiss_index: The FAISS index, initialized as None.
        :ivar embedding_dim: The dimension of embeddings, initialized as None.
        """
        self.faiss_index = None
        self.embedding_dim = None

    def normalize_embeddings(self, embeddings):
        """
        Normalize embeddings to have unit L2 norm.

        Rows that are all zeros are returned unchanged instead of being turned
        into NaNs by a division by zero.

        :param embeddings: 2-D array of embeddings to normalize (one row per vector).
        :type embeddings: np.ndarray
        :return: Normalized embeddings.
        :rtype: np.ndarray
        """
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        # A zero vector has norm 0; dividing by 1 instead keeps it a (harmless) zero row.
        norms[norms == 0] = 1.0
        return embeddings / norms

    def build_index(self, nodes):
        """
        Build the FAISS index from a list of nodes containing embeddings.

        :param nodes: List of nodes, where each node contains an `embedding` attribute.
        :type nodes: list
        :raises ValueError: If the nodes list is empty, a node has no embedding,
            or the embeddings are empty or of differing lengths.
        """
        if not nodes:
            raise ValueError("Nodes list cannot be empty.")

        vectors = [getattr(node, "embedding", None) for node in nodes]
        if any(vec is None for vec in vectors):
            raise ValueError("Every node must have an embedding before indexing.")

        dims = {len(vec) for vec in vectors}
        if len(dims) != 1 or 0 in dims:
            raise ValueError("All embeddings must be non-empty and share the same dimensionality.")

        # FAISS stores float32 vectors; convert once here rather than relying on implicit casts.
        embeddings = np.asarray(vectors, dtype=np.float32)
        normalized_embeddings = self.normalize_embeddings(embeddings)

        self.embedding_dim = normalized_embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)  # Inner-product similarity
        self.faiss_index.add(normalized_embeddings)

    def get_index(self):
        """
        Get the FAISS index instance.

        :return: The FAISS index used for similarity search.
        :rtype: faiss.IndexFlatIP
        :raises ValueError: If the index has not been built.
        """
        if self.faiss_index is None:
            raise ValueError("Index has not been built yet. Call 'build_index' first.")
        return self.faiss_index

In [18]:
class FAISSVectorStoreRetriever(BaseRetriever):
    """
    Dense retriever that looks up nearest neighbours in a prebuilt FAISS index.
    """

    def __init__(self, faiss_index, documents):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks, in the same order in which their
            embeddings were added to ``faiss_index`` (row ids are used as list positions).
        """
        self.faiss_index = faiss_index
        self.documents = documents
        # Let the base class set up its own state so the inherited API is not left uninitialised.
        super().__init__()

    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query (flat sequence of numbers).
        :param top_k: Number of top results to retrieve.
        :return: List of ``NodeWithScore``, best match first; the score is the value
            returned by the index search (higher means more similar for an
            inner-product index over normalised vectors).
        """
        # FAISS expects a 2-D float32 array: one row per query.
        query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        norm = np.linalg.norm(query)
        if norm > 0:  # leave an all-zero query untouched instead of producing NaNs
            query = query / norm

        similarities, indices = self.faiss_index.search(query, int(top_k))
        # The search result is already a similarity, so it is used as the score directly.
        # The previous "1 - value" turned the best hit into the lowest score, which
        # reversed this retriever's ranking wherever results are sorted by score.
        return [
            NodeWithScore(node=self.documents[idx], score=float(sim))
            for idx, sim in zip(indices[0], similarities[0])
            if idx != -1  # -1 marks an unfilled slot when fewer than top_k vectors exist
        ]

In [19]:
# embedding
# generate_embeddings writes onto the nodes in place and returns the same list,
# so `nodes_embed` and `nodes` refer to the same objects afterwards.
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

  0%|          | 0/2592 [00:00<?, ?it/s]

100%|██████████| 2592/2592 [02:22<00:00, 18.22it/s]


In [20]:
#indexing
# Build the inner-product index over the chunk embeddings and fetch the raw FAISS object.
index = FaissIndexer()
index.build_index(nodes_embed)
faiss_index = index.get_index()

# The retriever maps FAISS row ids back to list positions, so it must be given
# the same list, in the same order, that was indexed above.
faiss_retriever = FAISSVectorStoreRetriever(faiss_index=faiss_index,documents=nodes_embed)

## Sparse Retrieval via BM25

In [21]:
# bm25_retriever = BM25Retriever.from_defaults(
#     nodes=nodes,
#     similarity_top_k=5,
#     stemmer=Stemmer.Stemmer("english"),
#     language="english",
# )

## Hybrid Retrieval via Reciprocal Rank Fusion

In [22]:
def hybrid_embedding(results: dict, top_k: int):
    """
    Fuse several ranked result lists into one and keep the first ``top_k`` entries.

    The fusion is delegated to llama-index's private
    ``QueryFusionRetriever._reciprocal_rerank_fusion``. It is called unbound, with
    the class itself passed in the ``self`` position, so no retriever instance is built.

    :param results: Mapping from a retriever label to its list of scored nodes.
    :param top_k: Maximum number of fused results to return.
    :return: The first ``top_k`` items of the fused ranking.
    """
    fused = QueryFusionRetriever._reciprocal_rerank_fusion(QueryFusionRetriever, results)
    return fused[:top_k]

# Post Retrieval

## Summarization

In [23]:
def summarize_each_chunk(nodes, client, query, model="llama3.2", parent=False):
    """
    Produce one query-focused summary per item in ``nodes``.

    :param nodes: Items to summarise. With ``parent=True`` each item exposes
        ``text`` directly; otherwise the text is read from ``item.node.text``.
    :param client: Client object providing ``generate(model=..., prompt=...)``.
    :param query: The user query the summaries should focus on.
    :param model: Name of the model used for summarisation.
    :param parent: Selects where the text is read from (see ``nodes``).
    :return: List of summary strings (whitespace-stripped), in input order.
    """
    # Collect all texts up front, exactly one per input item.
    chunks = [item.text if parent else item.node.text for item in nodes]

    summaries = []
    for chunk in chunks:
        prompt = f"""
        Summarize the following text in one concise paragraph, focusing on key points relevant to the query: "{query}".
        
        - Emphasize information directly related to the query.
        - Exclude unrelated, redundant, or speculative details.
        - Do NOT introduce new information or answer the query itself. 
        
        Text:
        {chunk}
        
        Summary:
        """

        reply = client.generate(model=model, prompt=prompt)
        summaries.append(reply['response'].strip())

    return summaries

# Generation

In [24]:
def generate_response_with_notice(summaries, query, client, model="llama3.2"):
    """
    Answer ``query`` from a list of summaries, with an explicit "insufficient" fallback.

    NOTE: a later cell defines a three-argument function with this same name;
    once that cell has run, this version is no longer reachable.

    :param summaries: Iterable of summary strings; joined with newlines into the context.
    :param query: The question to answer.
    :param client: Client object providing ``generate(model=..., prompt=...)``.
    :param model: Name of the model that writes the answer.
    :return: The model's reply text, whitespace-stripped.
    """
    # One summary per line forms the context block.
    context = "\n".join(summaries)

    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    
    Response:
    """

    reply = client.generate(model=model, prompt=prompt)
    return reply['response'].strip()

In [25]:
# def generate_response_with_notice(processed_query, query, client, model="llama3.2"):
#     # Combine summaries into context block
#     context = "\n".join(summaries)
    
#     # Create prompt to answer based on summarized text
#     prompt = f"""
#     Use the following summarized information to answer the query accurately and concisely. 
#     DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
#     If the information is not sufficient to fully address the query, respond ONLY with:
#     "The available information is insufficient to provide a complete answer to this query."

#     Summarized Context:
#     {context}
    
#     Query:
#     {query}
    
#     Response:
#     """
    
#     # Send the prompt to Ollama
#     response = client.generate(
#         model=model,
#         prompt=prompt
#     )
    
#     return response['response'].strip()

In [26]:
def generate_response_with_notice(processed_query, client, model="llama3.2"):
    """
    Send a fully prepared instruction to the model and return its reply.

    This definition replaces the earlier four-argument function of the same name.

    :param processed_query: Complete instruction text (question plus any context).
    :param client: Client object providing ``generate(model=..., prompt=...)``.
    :param model: Name of the model that writes the answer.
    :return: The model's reply text, whitespace-stripped.
    """
    # Wrap the instruction in a minimal Instruction/Response frame.
    prompt = f"""
    Instruction:
    {processed_query}
    
    Response:
    """

    return client.generate(model=model, prompt=prompt)['response'].strip()

# Querying

## Query Transforms

In [27]:
# Few-shot examples
# Multiple-choice samples showing the expected "[letter]" answer format.
# NOTE(review): the numbering starts at "Example 2", and within this file the
# constant is referenced only inside the disabled triple-quoted block further down.
FEW_SHOTS = """


Example 2:
Question: What will happen when your front tire blows out?
A. The back end will sway towards the side of the blowout
B. The back end will sway away from the blowout
C. The front end will pull towards the side of the blowout
D. The front end will pull to the opposite side of the blowout
Please answer only in letters
Correct Answer: [C]

Example 3:
Question: What should you do when an ambulance comes up behind you flashing red lights and/or sounding its siren?
A. Stop as soon as you can
B. Maintain your speed, let the ambulance driver will find a way around you
C. Speed up so that you don't hold the ambulance
D. Pull over to the right and slow down or even stop if necessary
Please answer only in letters
Correct Answer: [D]
"""

def paraphrase_query(query,client,model):
    """
    Ask the model to rewrite ``query`` as a retrieval-oriented instruction.

    The model is shown three few-shot examples and asked to put its rewrite inside
    double quotes. The first usable quoted span of the reply is returned. Spans that
    are empty or consist only of a bracketed answer such as ``[C, A]`` are skipped,
    because the model sometimes answers the question instead of rewriting it.

    :param query: The original user query.
    :param client: Client object providing ``generate(model=..., prompt=...)``.
    :param model: Name of the model used for the rewrite.
    :return: The rewritten query, or ``query`` unchanged when no usable rewrite is found.
    """

    few_shot_examples = '''
    Example 1:
    Transform this into a Retrieval Augmented Generation query: What is AI?
    Output: "Retrieve information on artificial intelligence and provide a concise definition."

    Example 2:
    Transform this into a Retrieval Augmented Generation query: How does reinforcement learning work?
    Output: "Retrieve explanations of reinforcement learning algorithms with examples."

    Example 3:
    Transform this into a Retrieval Augmented Generation query: Explain neural networks.
    Output: "Retrieve details on neural networks and explain their structure and function."

    Now, transform the following query and provide the response inside double quotes:
    '''

    full_prompt = few_shot_examples + f"\nQuery: {query}"

    response = client.generate(
        model=model,
        prompt=full_prompt
    )
    # Extract text within double quotes
    matches = re.findall(r'"([^"]*)"', response['response'])

    print(f"Query: {query}")
    print(f"Transformed: {matches if matches else 'No match found'}")
    print('-' * 60)

    for match in matches:
        candidate = match.strip()
        # Skip blanks and bare answer brackets like "[A]"; neither is a rewritten query.
        if candidate and not re.fullmatch(r'\[[^\]]*\]', candidate):
            return candidate
    return query


def cot_qa_format(query, context, client, model):
    """
    Build a chain-of-thought style instruction around a paraphrased query.

    ``query`` is first rewritten via ``paraphrase_query``; the rewrite and the
    supplied context are then placed into a fixed step-by-step template.

    :param query: The original user query.
    :param context: Supporting text. A string is used as-is; a list, tuple or set of
        passages is joined with blank lines so the model sees plain text rather than
        a Python container repr.
    :param client: Client object forwarded to ``paraphrase_query``.
    :param model: Model name forwarded to ``paraphrase_query``.
    :return: The assembled instruction string.
    """
    paraphrased = paraphrase_query(query, client, model)

    # Interpolating a list/set directly would embed its repr: brackets, quotes and
    # escaped "\n" sequences instead of real line breaks.
    if isinstance(context, (list, tuple, set, frozenset)):
        context = "\n\n".join(str(piece) for piece in context)

    return f"""
    Let's break down the problem step by step:
    1. Identify the key components of the following question: {paraphrased}
    2. Analyze relevant facts and eliminate unnecessary details.
    3. Provide the best possible answer based on the reasoning process.

    Here is relevant context about the query:
    {context}
    """


'''
            # 3. Apply Chain-of-Thought (CoT) prompt transformation
            new_query = cot_qa_format(prompt, combined_context)
            result1 = ollama_llm.complete(new_query)
            result2 = ollama_llm.complete(new_query)
            result3 = ollama_llm.complete(new_query)
            #print(f"result 1: {result1}")
            #print(f"result 2: {result2}")
            #print(f"result 3: {result3}")

            # Aggregate results for final prompt
            final_query_with_results = f"""
            Based on the results:
            - Result 1: {result1.text}
            - Result 2: {result2.text}
            - Result 3: {result3.text}

            Now, synthesize these results and answer this question:
            {paraphrase_query(prompt) + "And answer which of the choices match the synthesized results."}
            {prompt_with_choice}

            \nPlease answer only in letters and put them inside a bracket '[]'. Don't add anything but the letters. No explanation. If you don't see the answer from the choices using the information you retrieved, use background knowledge.
            Here are two examples on how to answer the question:
            {FEW_SHOTS}

            """
            #print(final_query_with_results)
            answer = ollama_llm.complete(final_query_with_results)
            print(answer)

            # 4. Format References
            ref_text = "\n".join(
                [f"Source {i+1}: {doc.metadata['title']} (Page {doc.metadata['page']}, Folder: {doc.metadata['folder']})"
                 for i, doc in enumerate(retrieved_docs)]
            )

            # 5. Store results
            results.append({
                "question": prompt,
                "answer": answer.text,
                "references": ref_text
            })

            answers.append(answer.text)

        except Exception as e:
            results.append({
                "question": prompt,
                "answer": str(e),
                "references": ""
            })
            print(f"Error processing prompt: {e}")

        # i+=1
        # if i==1:
        #     return answers
    
    return answers

'''

'\n            # 3. Apply Chain-of-Thought (CoT) prompt transformation\n            new_query = cot_qa_format(prompt, combined_context)\n            result1 = ollama_llm.complete(new_query)\n            result2 = ollama_llm.complete(new_query)\n            result3 = ollama_llm.complete(new_query)\n            #print(f"result 1: {result1}")\n            #print(f"result 2: {result2}")\n            #print(f"result 3: {result3}")\n\n            # Aggregate results for final prompt\n            final_query_with_results = f"""\n            Based on the results:\n            - Result 1: {result1.text}\n            - Result 2: {result2.text}\n            - Result 3: {result3.text}\n\n            Now, synthesize these results and answer this question:\n            {paraphrase_query(prompt) + "And answer which of the choices match the synthesized results."}\n            {prompt_with_choice}\n\n            \nPlease answer only in letters and put them inside a bracket \'[]\'. Don\'t add anything b

## Query Generation

In [28]:
# Lookup table from a page's metadata to its full Document.
docstore = {}

# A dict is not hashable, so the metadata's (key, value) pairs are frozen into a tuple.
for doc in documents:
    docstore[tuple(doc.metadata.items())] = doc

In [29]:
def get_document_by_chunk_metadata(chunk_node):
    """
    Find the full document a chunk belongs to by matching metadata.

    The chunk's metadata items are turned into a tuple and looked up in the
    module-level ``docstore``.

    :param chunk_node: Object exposing a ``metadata`` dict.
    :return: The stored document, or ``None`` when nothing is stored under that metadata.
    """
    return docstore.get(tuple(chunk_node.metadata.items()))

In [30]:
def remove_duplicate_documents(doc_list):
    """
    Drop repeated documents, keeping the first occurrence of each ``doc_id``.

    :param doc_list: Iterable of objects exposing a hashable ``doc_id``.
    :return: New list with one document per ``doc_id``, in first-seen order.
    """
    first_by_id = {}
    for doc in doc_list:
        # setdefault only stores when the id is new, so later duplicates are ignored.
        first_by_id.setdefault(doc.doc_id, doc)
    return list(first_by_id.values())

In [31]:
def gen_query(query, top_k, client, mode='dense', summary=False, model="llama3.2", chunks_only=False):
    """
    Answer ``query`` with retrieval-augmented generation and list the sources used.

    :param query: The question text; used for retrieval and for the final instruction.
    :param top_k: Number of chunks to retrieve (coerced to ``int``).
    :param client: Client object used for embeddings and text generation.
    :param mode: ``'dense'`` uses the FAISS retriever only, ``'sparse'`` uses BM25 only;
        any other value fuses both rankings via ``hybrid_embedding``.
    :param summary: When true, every context item is summarised before being used.
    :param model: Model that writes the final answer. Paraphrasing and summarising
        always use ``'llama3.2:latest'``.
    :param chunks_only: When true the retrieved chunks themselves form the context;
        otherwise each chunk is replaced by the full page document it was cut from.
    :return: ``(answer, references)``: the answer text and a newline-separated
        list of the sources of the nodes that were actually used.
    """
    top_k = int(top_k)  # UI sliders may deliver the value as a float

    # Run only the retrievers the selected mode needs.
    dense_hits = []
    sparse_hits = []
    if mode != 'sparse':
        response = client.embeddings(prompt=query, model="mxbai-embed-large")
        dense_hits = faiss_retriever._retrieve(response["embedding"], top_k=top_k)
    if mode != 'dense':
        # NOTE(review): the BM25 index is rebuilt from `nodes` on every call.
        bm25_retriever = BM25Retriever.from_defaults(
            nodes=nodes,
            similarity_top_k=top_k,
            stemmer=Stemmer.Stemmer("english"),
            language="english",
        )
        sparse_hits = bm25_retriever.retrieve(query)

    if mode == 'dense':
        print('using FAISS')
        ans_nodes = dense_hits
    elif mode == 'sparse':
        print('using BM25')
        ans_nodes = sparse_hits
    else:
        print('using Hybrid')
        ans_nodes = hybrid_embedding({'faiss': dense_hits, 'bm25': sparse_hits}, top_k=top_k)

    if chunks_only:
        print('using chunks only')
        context_nodes = ans_nodes
        context = [hit.node.text for hit in ans_nodes]
    else:
        # Swap each chunk for its parent page. Chunks whose parent cannot be found
        # are skipped, and pages hit more than once are kept only once.
        parents = [get_document_by_chunk_metadata(hit) for hit in ans_nodes]
        context_nodes = remove_duplicate_documents([doc for doc in parents if doc is not None])
        # dict.fromkeys de-duplicates identical texts while keeping the ranking order
        # (a plain set would shuffle the context between runs).
        context = list(dict.fromkeys(doc.text for doc in context_nodes))

    if summary:
        print('using summaries')
        context = summarize_each_chunk(
            context_nodes, client, model='llama3.2:latest', query=query, parent=not chunks_only
        )

    instruction = cot_qa_format(query, context, client, model='llama3.2:latest')
    answer = generate_response_with_notice(instruction, client, model=model)

    # Format the references from the nodes that fed the answer. They were previously
    # always taken from the hybrid ranking, even in dense or sparse mode.
    references = []
    for i, hit in enumerate(ans_nodes, start=1):
        metadata = hit.node.metadata
        references.append(
            f"Source {i}: {metadata['title']} (Page {metadata['page']}, Folder: {metadata['folder']})"
        )

    return answer, "\n".join(references)

# Evaluation

In [32]:
# Generate prompts dynamically
def generate_prompt(row):
    """
    Turn one exam row into a multiple-choice prompt.

    :param row: Mapping with a ``'Question'`` entry and choice entries ``'A'``..``'E'``;
        choices that are NaN/None or the empty string are left out.
    :return: Prompt text: the question, one line per available choice, and the
        answer-format instruction.
    """
    # Keep only the choices that actually carry a value.
    options = [
        f"{letter}. {row[letter]}"
        for letter in ('A', 'B', 'C', 'D', 'E')
        if pd.notna(row[letter]) and row[letter] != ''
    ]

    question_part = f"\nActual Question: {row['Question']}\n"
    format_part = "\nPlease answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then add comma separator if there are multiple answers ONLY IF ALLOWED."

    return question_part + "\n".join(options) + format_part

In [33]:
# Load the exam questions from the CSV file and build one prompt per row.
# NOTE(review): the path is relative, so it is resolved against the kernel's working
# directory; the recorded run of this cell failed with FileNotFoundError.
file_path = 'data/LTO_EXAM.csv'
df = pd.read_csv(file_path)
df['Prompt'] = df.apply(generate_prompt, axis=1)
display(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'data/LTO_EXAM.csv'

In [None]:
# Half-open range [start, stop) of row labels to evaluate.
qr_range = (23,28)

# Create the answer column with object dtype from the start: the answers are strings,
# and writing strings into a float (NaN) column triggers pandas' incompatible-dtype warning.
df["AI"] = pd.Series(np.nan, index=df.index, dtype=object)

# gen_query returns (answer, references); one tuple per evaluated question.
ai_answer = [
    gen_query(df.loc[i,"Prompt"], top_k=15, client=client, mode='hybrid', model="llama3.3")
    for i in tqdm(range(*qr_range))
]

# .loc slices include the end label, hence the "- 1".
df.loc[qr_range[0]:qr_range[1]-1, "AI"] = [answ[0] for answ in ai_answer]
df.loc[qr_range[0]:qr_range[1]-1, ["Question","Answer","AI"]]

  0%|                                             | 0/5 [00:00<?, ?it/s]

using Hybrid
Query: 
Actual Question: To aobtain one's driver's license, one must be at least:
A. 16 years old
B. 17 years old
C. 18 years old
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.
Transformed: ["Retrieve information on driver's license requirements, focusing on age restrictions.", '[A, C]']
------------------------------------------------------------


 20%|███████▍                             | 1/5 [00:52<03:31, 52.97s/it]

using Hybrid
Query: 
Actual Question: A pre-trip inspection should be completed:
A. before and after operating the motor vehicle
B. after operating the motor vehicle
C. before operating the motor vehicle
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.
Transformed: ['[C, A]']
------------------------------------------------------------


 40%|██████████████▊                      | 2/5 [01:47<02:42, 54.10s/it]

using Hybrid
Query: 
Actual Question: It is not conidered safe driving on an expressway when:
A. driver is driving at 80 kph
B. driver is driving at 60 kph
C. driver keep changing lanes without signaling
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.
Transformed: ['Retrieve information on driving safety rules and provide answers to the following questions: What speed is considered safe for an expressway? Is it acceptable to change lanes without signaling?', '[A]']
------------------------------------------------------------


 60%|██████████████████████▏              | 3/5 [02:46<01:52, 56.27s/it]

using Hybrid
Query: 
Actual Question: When you do not see the wheels of the vehicles in front of you, should you do?
A. Slow down and get back to a safer following distance
B. Turn your fog light on
C. Increase your speed until it becomes visible
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.
Transformed: ['Retrieve information on safe driving practices during low visibility conditions and provide guidance on appropriate actions to take when unable to see the wheels of vehicles in front.', 'check all that apply']
------------------------------------------------------------


 80%|█████████████████████████████▌       | 4/5 [03:52<01:00, 60.10s/it]

using Hybrid
Query: 
Actual Question: Your speed while driving at night should keep on:
A. the distance that you can see where you come to a complete stop within distance that your headlights can light up.
B. the speed since you have enough lighting anyways.
C. a slow speed to avoid road crash
Please answer only in letters and put them inside a bracket '[]'. If the question contains the statement 'Check all that apply' then addd comma separator if there are multiple answers ONLY IF ALLOWED.
Transformed: ['Retrieve information on safe driving practices during nighttime and provide guidance on speed limits.']
------------------------------------------------------------


100%|█████████████████████████████████████| 5/5 [04:29<00:00, 53.84s/it]
  df.loc[qr_range[0]:qr_range[1]-1, "AI"] = [answ[0] for answ in ai_answer]


Unnamed: 0,Question,Answer,AI
23,"To aobtain one's driver's license, one must be...",B,The provided text appears to be a collection o...
24,A pre-trip inspection should be completed:,C,It appears you've provided a large document co...
25,It is not conidered safe driving on an express...,C,It seems like you've provided a large document...
26,When you do not see the wheels of the vehicles...,A,It appears you've provided a large collection ...
27,Your speed while driving at night should keep on:,A,It appears that you've provided a collection o...


# Visualization

In [None]:
# Gradio interface with dynamic model, mode selection, and top_k slider
# The four inputs are passed positionally to the lambda in the order listed below;
# gen_query's `summary` and `chunks_only` stay at their defaults.
# gen_query returns (answer, references), which fill the two output boxes in that order.
iface = gr.Interface(
    fn=lambda query, top_k, model, mode: gen_query(
        query=query,
        top_k=top_k,
        client=client,
        mode=mode,
        model=model
    ),
    inputs=[
        gr.Textbox(label="Enter your query"),
        gr.Slider(1, 20, value=5, step=1, label="Top K Results"),  # Slider for top_k (1 to 20)
        gr.Dropdown(
            choices=["llama3.3", "llama3.2:latest", "llama3.1:8b"],
            value="llama3.3",
            label="Select Model"
        ),
        gr.Dropdown(
            choices=["hybrid", "dense", "sparse"],
            value="hybrid",
            label="Select Retrieval Mode"
        )
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=6),
        gr.Textbox(label="References", lines=10),
    ],
    title="RAG System with LLaMA Models",
    description=(
        "Ask questions and get answers with references from PDF documents. "
        "Adjust Top-K to control the number of retrieved chunks. "
        "Choose different models and retrieval modes for customization."
    )
)

# Launch the Gradio interface
iface.launch()

* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




In [None]:
# Handle closing behavior
def on_close():
    """Close the Gradio interface held in the module-level ``iface``."""
    iface.close()
# Called immediately, so running this cell shuts the interface down.
on_close()

Closing server running on port: 7867
