### Imports and Path setup

In [31]:
from pathlib import Path
import chromadb
import pickle
import os
from dotenv import load_dotenv
load_dotenv()

multiquery_rag_output_path = "../RAG Results/multiquery_rag_results.txt"
Relative_Database_path = "./chroma_Data"
Absolute_Database_path = Path(Relative_Database_path).resolve()
file_path = "../Chunking/Chunk_files/julius-caesar_chunks_semantic.pkl"
# Create a new collection with a unique name
collection_name = "anlp_rag_collection"
# # Set API key
# os.environ["GOOGLE_API_KEY"] = os.environ.get("GEMINI_API_KEY")


### Chroma Setup and Chunk Loading
Sets up persistant client and loads previously computed chunks

In [32]:
# Initialize the persistent client
client = chromadb.PersistentClient(path=Absolute_Database_path)
print(f"[INFO] ChromaDB client initialized at: {Absolute_Database_path}")

# List existing collections
existing_collections = client.list_collections()
print(f"Existing collections: {[c.name for c in existing_collections]}")

[INFO] ChromaDB client initialized at: C:\Users\micro\Desktop\Abhinav college\Resources\Sem 7\Advanced NLP\Assignment 2\RAG-A2\VectorDB\chroma_Data
Existing collections: ['anlp_rag_collection']


In [33]:

# No need for fitz or RecursiveCharacterTextSplitter here, as we are loading from a file.


loaded_docs = []

try:
    with open(file_path, "rb") as f: # 'rb' mode for reading in binary
        loaded_docs = pickle.load(f)
    print(f"Successfully loaded {len(loaded_docs)} chunks from '{file_path}'.")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"Error loading file: {e}")

# Now you can inspect the loaded documents to verify.
print("\nHere is the metadata of a loaded chunk:")
if loaded_docs:
    print(loaded_docs[0].metadata)

Successfully loaded 126 chunks from '../Chunking/Chunk_files/julius-caesar_chunks_semantic.pkl'.

Here is the metadata of a loaded chunk:
{'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'page_number': 3, 'c': 'semantic', 'ischunk': True}


### Set up Embedding Function
Will use default SentenceTransformer for generating embeddings

In [34]:
# Install if needed
# !pip install sentence_transformers

# Set up embedding function
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
print("Embedding function initialized with model: all-MiniLM-L6-v2")

Embedding function initialized with model: all-MiniLM-L6-v2


### Creating new Collection

In [35]:
from datetime import datetime

# FORCE DELETE the collection if it exists
try:
    client.delete_collection(name=collection_name)
    print(f"[INFO] Deleted existing collection '{collection_name}'")
except Exception as e:
    print(f"[INFO] No existing collection named '{collection_name}' to delete.")

# Create a FRESH collection
collection = client.create_collection(
    name=collection_name,
    embedding_function=embedding_function,
    metadata={
        "description": "Julius Caesar Chunks collection for RAG",
        "created": str(datetime.now())
    }
)

print(f"[SUCCESS] Fresh collection '{collection_name}' created successfully")
print(f"Current count in collection: {collection.count()}")

[INFO] Deleted existing collection 'anlp_rag_collection'
[SUCCESS] Fresh collection 'anlp_rag_collection' created successfully
Current count in collection: 0


### Add data to collection
The chunks have to be given an id and added to the collection now

In [36]:
import uuid
import hashlib

# Extract document name from file path for ID generation
# This will give us "julius-caesar" from the file path
doc_name = file_path.split('/')[-1].split('_chunks')[0]

# Prepare documents for ChromaDB
ids = []
documents = []
metadatas = []

print(f"[INFO] Generating IDs with prefix: {doc_name}")

# Process each loaded document chunk
for i, doc in enumerate(loaded_docs):
    # Generate a deterministic ID based on document name and index
    # This ensures all chunks from the same document have consistent IDs
    doc_id = f"{doc_name}_chunk_{i}"
    
    # Get the document text
    document_text = doc.page_content
    
    # Get the document metadata
    metadata = doc.metadata
    
    # Add to our lists
    ids.append(doc_id)
    documents.append(document_text)
    metadatas.append(metadata)

print(f"[INFO] Prepared {len(ids)} documents with IDs like: {ids[0]}, {ids[1] if len(ids) > 1 else 'N/A'}...")

# Add documents in batches to avoid memory issues
batch_size = 500
total_added = 0

for i in range(0, len(ids), batch_size):
    end_idx = min(i + batch_size, len(ids))
    
    # Simply add all documents (collection is fresh, no need to update)
    collection.add(
        ids=ids[i:end_idx],
        documents=documents[i:end_idx],
        metadatas=metadatas[i:end_idx]
    )
    
    total_added += end_idx - i
    print(f"[INFO] Added batch: {i} to {end_idx-1} ({end_idx-i} documents)")

print(f"\n[SUCCESS] Added {total_added} documents to collection '{collection_name}'")
print(f"[INFO] All chunks have IDs in format: {doc_name}_chunk_<number>")

[INFO] Generating IDs with prefix: julius-caesar
[INFO] Prepared 126 documents with IDs like: julius-caesar_chunk_0, julius-caesar_chunk_1...
[INFO] Added batch: 0 to 125 (126 documents)

[SUCCESS] Added 126 documents to collection 'anlp_rag_collection'
[INFO] All chunks have IDs in format: julius-caesar_chunk_<number>
[INFO] Added batch: 0 to 125 (126 documents)

[SUCCESS] Added 126 documents to collection 'anlp_rag_collection'
[INFO] All chunks have IDs in format: julius-caesar_chunk_<number>


In [37]:
# Check collection count
count = collection.count()
print(f"Total documents in collection: {count}")

# Peek at the first few entries
peek = collection.peek(limit=3)
print("\nSample entries:")
for i, (doc_id, doc_text, metadata) in enumerate(zip(
    peek['ids'], peek['documents'], peek['metadatas']
)):
    print(f"\n--- Document {i+1} ---")
    print(f"ID: {doc_id}")
    print(f"Text: {doc_text[:100]}...")
    print(f"Metadata: {metadata}")

Total documents in collection: 126

Sample entries:

--- Document 1 ---
ID: julius-caesar_chunk_0
Text: Michael Witmore
Director, Folger Shakespeare Library
It is hard to imagine a world without Shakespea...
Metadata: {'c': 'semantic', 'page_number': 3, 'ischunk': True, 'source': '../julius-caesar_PDF_FolgerShakespeare.pdf'}

--- Document 2 ---
ID: julius-caesar_chunk_1
Text: Until now, with the release of The Folger Shakespeare (formerly
Folger Digital Texts), readers in se...
Metadata: {'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'c': 'semantic', 'page_number': 4, 'ischunk': True}

--- Document 3 ---
ID: julius-caesar_chunk_2
Text: At
any point in the text, you can hover your cursor over a bracket for
more information. Because the...
Metadata: {'page_number': 5, 'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'ischunk': True, 'c': 'semantic'}


## Quantitative Analysis using RAGAs: Faithfulness and Answer Relevency

### Querying the Database

In [38]:
# Rich table for displaying results (optional but nice)
try:
    from rich.console import Console
    from rich.table import Table
    
    console = Console()
    use_rich = True
except ImportError:
    use_rich = False
    print("Rich package not found. Using standard print.")

# Function to display query results
def print_results(results, use_rich=use_rich):
    if use_rich:
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Rank", width=6)
        table.add_column("Document ID")
        table.add_column("Document Text", width=60)
        table.add_column("Page")
        table.add_column("Distance")
        
        docs = results['documents'][0]
        ids = results['ids'][0]
        metas = results['metadatas'][0]
        distances = results['distances'][0]
        
        for i, (doc, doc_id, meta, dist) in enumerate(zip(docs, ids, metas, distances)):
            table.add_row(
                str(i+1),
                doc_id,
                (doc[:100] + "...") if len(doc) > 100 else doc,
                str(meta.get('page_number', 'N/A')),
                f"{dist:.4f}"
            )
        
        console.print(table)
    else:
        # Standard print version
        for i, (doc, meta, dist) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            print(f"\n--- Result {i+1} ---")
            print(f"Text: {doc[:100]}...")
            print(f"Metadata: {meta}")
            print(f"Distance: {dist:.4f}")

In [39]:
# Run a sample query
query = "What themes are explored in Julius Caesar?"
results = collection.query(
    query_texts=[query],
    n_results=3,
    include=["documents", "metadatas", "distances"]
)

print(f"\nResults for query: '{query}'")
print_results(results)


Results for query: 'What themes are explored in Julius Caesar?'


### Natural Language Generation

In [40]:
!pip install google-generativeai langchain-google-genai



In [41]:
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI


# Initialize Gemini (fixed the model name - using a valid Gemini model)
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=0.7)

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

In [None]:
from langchain.prompts import PromptTemplate

# Better prompt template for Julius Caesar
rag_prompt_template = """
You are an expert on Shakespeare's Julius Caesar. Answer questions using ONLY the context below.
If you can't find a complete answer in the context but see partial information, try to provide what you can find and acknowledge the limitations of the available information.
If there is NO relevant information at all in the context, respond with "I don't have enough information to answer this question."

Context:
{context}

Question: {query}

Answer (based only on the context provided):
"""

prompt = PromptTemplate(
    template=rag_prompt_template,
    input_variables=["context", "query"]
)

In [None]:
!pip install rank_bm25

In [None]:
from rank_bm25 import BM25Okapi
import numpy as np

def answer_with_hybrid_rag(query, n_results=5):
    # 1. Semantic search with ChromaDB
    semantic_results = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )
    
    # 2. Perform keyword search with BM25
    # First get all documents to search across
    all_docs = collection.get(
        limit=100,  # Adjust based on your collection size
        include=["documents", "metadatas"]
    )
    
    # Tokenize for BM25
    tokenized_docs = [doc.split() for doc in all_docs["documents"]]
    bm25 = BM25Okapi(tokenized_docs)
    
    # Get BM25 scores
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)
    
    # Get top BM25 results
    top_bm25_indices = np.argsort(bm25_scores)[-n_results:][::-1]
    
    # 3. Combine results (simple union)
    combined_docs = []
    combined_meta = []
    combined_ids = [] 
    seen_ids = set()
    
    # Add semantic results
    for doc, meta, doc_id in zip(
        semantic_results["documents"][0], 
        semantic_results["metadatas"][0],
        semantic_results["ids"][0]
    ):
        if doc_id not in seen_ids:
            combined_docs.append(doc)
            combined_meta.append(meta)
            combined_ids.append(doc_id)
            seen_ids.add(doc_id)
    
    # Add keyword results
    for idx in top_bm25_indices:
        doc_id = all_docs["ids"][idx]
        if doc_id not in seen_ids:
            combined_docs.append(all_docs["documents"][idx])
            combined_meta.append(all_docs["metadatas"][idx])
            combined_ids.append(doc_id)
            seen_ids.add(doc_id)
    
    # Limit to n_results total
    combined_docs = combined_docs[:n_results]
    combined_meta = combined_meta[:n_results]
    combined_ids = combined_ids[:n_results]
    
    # Format context and complete RAG as before
    formatted_docs = []
    for doc, meta in zip(combined_docs, combined_meta):
        page_num = meta.get("page_number", "unknown")
        formatted_docs.append(f"[Page {page_num}]: {doc}")
    
    context = "\n\n---\n\n".join(formatted_docs)
    filled_prompt = prompt.format(context=context, query=query)
    response = llm.invoke(filled_prompt)
    
    # Create a mock results object for print_results compatibility
    mock_results = {
        "documents": [combined_docs],
        "metadatas": [combined_meta],
        "distances": [[0.0] * len(combined_docs)],  # Placeholder distances
        "ids": [combined_ids]
    }
    
    return {
        "query": query,
        "answer": response.content if hasattr(response, 'content') else str(response),
        "source_documents": mock_results
    }

In [None]:
# Test our RAG pipeline with a question
test_query = "What is the relationship between Brutus and Caesar?"
response = answer_with_hybrid_rag(test_query)

print(f"Question: {test_query}")
print(f"\nAnswer: {response['answer']}")
print("\nSources:")
print_results(response["source_documents"])

In [None]:
# Test with multiple questions to evaluate system
results_for_export = []

test_questions = [
    "What are the main themes in Julius Caesar?",
    "How does Brutus justify killing Caesar?",
    "What role does Cassius play in the conspiracy?"
]

for question in test_questions:
    print("\n" + "="*50)
    print(f"Question: {question}")
    response = answer_with_hybrid_rag(question)
    print(f"\nAnswer: {response['answer']}")
    print("\nTop source:")
    if len(response["source_documents"]["documents"][0]) > 0:
        top_doc = response["source_documents"]["documents"][0][0]
        top_meta = response["source_documents"]["metadatas"][0][0]
        page = top_meta.get("page_number", "N/A")
        print(f"[Page {page}]:\n{top_doc[:200]}...")  # Print first 200 chars
        # Save for export
        results_for_export.append({
            "question": question,
            "answer": response['answer'],
            "page": page,
            "chunk": top_doc
        })
    else:
        print("No sources found.")
        results_for_export.append({
            "question": question,
            "answer": response['answer'],
            "page": None,
            "chunk": None
        })

# Export results to a well-formatted text file
with open(multiquery_rag_output_path, "w", encoding="utf-8") as f:
    f.write("RAG Multi-Query Evaluation Results\n")
    f.write("="*60 + "\n\n")
    for idx, res in enumerate(results_for_export, 1):
        f.write(f"Question {idx}: {res['question']}\n")
        f.write(f"Answer:\n{res['answer']}\n\n")
        if res["chunk"]:
            f.write(f"Top Source Chunk (Page {res['page']}):\n{res['chunk']}\n")
        else:
            f.write("Top Source Chunk: No sources found.\n")
        f.write("-"*60 + "\n\n")
print(f"\nResults exported to {multiquery_rag_output_path}")

In [None]:
# Cell 8: Evaluate RAG pipeline using RAGAS
# Faithfulness & Answer Relevancy
# Ollama for RAG generation | Zephyr for offline RAGAS evaluation

# ==== INSTALL DEPENDENCIES ====
# !pip install ragas datasets transformers accelerate sentence-transformers tqdm
# Make sure you have Ollama installed: https://ollama.ai/download
# Example to pull a model: `ollama pull llama3` or `ollama pull mistral`

import os
import json
import subprocess
from tqdm import tqdm
from datetime import datetime
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
from ragas.llms import HuggingfaceLLM
from ragas.embeddings import HuggingfaceEmbeddings

# ==== CONFIG ====
testbed_path = "../RAG Results/test_bed.json"
output_metrics_path = "../RAG Results/multiquery_rag_metrics.txt"
TOP_K = 3

# --- LLMs ---
OLLAMA_MODEL = "llama3"                     # for RAG generation (local via Ollama)
LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # for RAGAS evaluation (offline)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # for embeddings
# =================

# 1️⃣ Load test data
with open(testbed_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

print(f"[INFO] Loaded {len(test_data)} QA pairs from testbed.")

# 2️⃣ Function to generate RAG answers using Ollama
def generate_with_ollama(prompt, model_name=OLLAMA_MODEL):
    """
    Generate a response using a local Ollama model.
    Assumes Ollama is installed and the model is already pulled.
    """
    try:
        # Run the Ollama CLI
        result = subprocess.run(
            ["ollama", "run", model_name],
            input=prompt.encode("utf-8"),
            capture_output=True,
            text=True,
            timeout=120
        )
        if result.returncode == 0:
            return result.stdout.strip()
        else:
            print(f"[WARN] Ollama returned error: {result.stderr}")
            return None
    except Exception as e:
        print(f"[ERROR] Ollama call failed: {e}")
        return None

# 3️⃣ Prepare evaluation records
records = []
for item in tqdm(test_data, desc="Generating Ollama RAG answers"):
    question = item["question"]
    ideal_answer = item["ideal_answer"]

    # --- Retrieve from Chroma ---
    retrieved = collection.query(query_texts=[question], n_results=TOP_K)
    retrieved_docs = retrieved["documents"][0]
    retrieved_context = "\n".join(retrieved_docs)

    # --- Build RAG prompt ---
    prompt = (
        f"You are a helpful assistant. "
        f"Use only the information provided in the context below to answer the question.\n\n"
        f"Context:\n{retrieved_context}\n\n"
        f"Question:\n{question}\n\nAnswer:"
    )

    # --- Generate answer using Ollama ---
    generated_answer = generate_with_ollama(prompt)
    if not generated_answer:
        generated_answer = f"[Fallback mock answer] {retrieved_docs[0][:150]}..."

    # --- Add record for RAGAS evaluation ---
    records.append({
        "question": question,
        "contexts": retrieved_docs,
        "answer": generated_answer,
        "ground_truth": [ideal_answer],
    })

# 4️⃣ Convert to Hugging Face Dataset
dataset = Dataset.from_list(records)

# 5️⃣ Initialize Zephyr & embedding models for RAGAS (offline)
llm = HuggingfaceLLM(model=LLM_MODEL)
embeddings = HuggingfaceEmbeddings(model_name=EMBED_MODEL)

# 6️⃣ Evaluate using RAGAS
print(f"\n[INFO] Evaluating with RAGAS (Faithfulness & Answer Relevancy) using {LLM_MODEL} ...")
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy],
    llm=llm,
    embeddings=embeddings
)

# 7️⃣ Extract scores
faithfulness_score = results["faithfulness"]
answer_relevancy_score = results["answer_relevancy"]

# 8️⃣ Save results to file
with open(output_metrics_path, "w", encoding="utf-8") as f:
    f.write("=== RAG Evaluation Metrics (Ollama + RAGAS Offline) ===\n")
    f.write(f"Timestamp: {datetime.now()}\n\n")
    f.write(f"Faithfulness: {faithfulness_score:.4f}\n")
    f.write(f"Answer Relevancy: {answer_relevancy_score:.4f}\n\n")
    f.write("Full Results:\n")
    f.write(str(results))

print(f"\n✅ Evaluation complete! Metrics saved to '{output_metrics_path}'")
print(f"Faithfulness: {faithfulness_score:.4f} | Answer Relevancy: {answer_relevancy_score:.4f}")

ModuleNotFoundError: No module named 'datasets'