# **LlamaIndex / RAG Pipeline Evaluation**
**Goal:**
Build a RAG Evaluation Notebook that compares:
* LangChain RAG (you already built)
* LlamaIndex RAG (new)

...on the same document using Gemini embeddings + Gemini LLM
and evaluates:
* Retrieval quality
* Latency
* Cost (approx / mock)

### Import Libraries & Load API Key

In [1]:
import os
import time
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Configure the Gemini API key
import google.generativeai as genai
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found. Please create a .env file with your key.")
genai.configure(api_key=api_key)

print("✅ Libraries imported and API key configured.")

✅ Libraries imported and API key configured.


  from .autonotebook import tqdm as notebook_tqdm


### Load the Document

In [2]:
# Load the knowledge base document
try:
    with open("../data/sample_document.txt", "r") as f:
        document_text = f.read()
    print("📘 Document loaded successfully.")
    # print(document_text)
except FileNotFoundError:
    print("Error: sample_document.txt not found. Please create it first.")

📘 Document loaded successfully.


In [3]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader("../data/").load_data()
print("Loaded docs:", len(documents))


Loaded docs: 1


### LangChain RAG Pipeline

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA

# 1. Initialize Gemini Models
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
gemini_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.3)

# 2. Split the document into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = text_splitter.create_documents([document_text])

# 3. Create a Vector Store (FAISS) with Gemini Embeddings
try:
    langchain_vector_store = FAISS.from_documents(documents, gemini_embeddings)
    langchain_retriever = langchain_vector_store.as_retriever()
    print("✅ LangChain Vector Store created.")

    # 4. Create the LangChain RetrievalQA chain
    langchain_rag_chain = RetrievalQA.from_chain_type(
        llm=gemini_llm,
        chain_type="stuff",
        retriever=langchain_retriever
    )
    print("✅ LangChain RAG Chain is ready.")

except Exception as e:
    print(f"An error occurred: {e}")

✅ LangChain Vector Store created.
✅ LangChain RAG Chain is ready.


### LlamaIndex RAG Pipeline

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.google import GeminiEmbedding
from llama_index.llms.gemini import Gemini


# 1. Configure LlamaIndex to use Gemini models
# Note: LlamaIndex uses a global Settings object for configuration
Settings.llm = Gemini(model_name="models/gemini-2.0-flash")
Settings.embed_model = GeminiEmbedding(model_name="models/text-embedding-004")
Settings.chunk_size = 500
Settings.chunk_overlap = 50

print("✅ LlamaIndex Settings configured for Gemini.")

# 2. Load documents and build the index
# LlamaIndex can read directly from a file/directory
try:
    llama_documents = SimpleDirectoryReader(input_files=["../data/sample_document.txt"]).load_data()
    llama_index = VectorStoreIndex.from_documents(llama_documents)
    print("✅ LlamaIndex Vector Store Index created.")

    # 3. Create the LlamaIndex Query Engine
    llama_query_engine = llama_index.as_query_engine()
    print("✅ LlamaIndex Query Engine is ready.")

except Exception as e:
    print(f"An error occurred: {e}")

✅ LlamaIndex Settings configured for Gemini.
✅ LlamaIndex Vector Store Index created.
✅ LlamaIndex Query Engine is ready.


### RAG Evaluation Logic

In [None]:

# Define a set of questions to test the RAG pipelines
evaluation_questions = [
    "What is the Mediterranean diet?",
    "What are the main health benefits of this diet?",
    "What is the primary source of fat mentioned?",
    "How does the diet help improve cholesterol levels?",
    "What role do antioxidants play in the Mediterranean diet?"
]

print(f" Evaluation will be run on {len(evaluation_questions)} questions.")

🧪 Evaluation will be run on 5 questions.


In [None]:
results = []
print(" Starting evaluation...")

for question in evaluation_questions:
    print(f"\nEvaluating question: '{question}'")

    # --- LangChain Evaluation ---
    lc_start_time = time.time()
    lc_response = langchain_rag_chain.invoke({"query": question})
    lc_end_time = time.time()
    lc_latency = lc_end_time - lc_start_time
    # Retrieve the source documents used by LangChain
    lc_retrieved_docs = langchain_retriever.get_relevant_documents(question)
    lc_context = "\n\n".join([doc.page_content for doc in lc_retrieved_docs])

    # --- LlamaIndex Evaluation ---
    li_start_time = time.time()
    li_response = llama_query_engine.query(question)
    li_end_time = time.time()
    li_latency = li_end_time - li_start_time
    # LlamaIndex response object contains the source nodes
    li_context = "\n\n".join([node.get_content() for node in li_response.source_nodes])

    # Store results for this question
    results.append({
        "question": question,
        "langchain_answer": lc_response['result'],
        "langchain_context": lc_context,
        "langchain_latency": lc_latency,
        "llamaindex_answer": str(li_response),
        "llamaindex_context": li_context,
        "llamaindex_latency": li_latency
    })

print("\n✅ Evaluation complete.")

# Convert results to a pandas DataFrame for easier analysis
df_results = pd.DataFrame(results)

🚀 Starting evaluation...

Evaluating question: 'What is the Mediterranean diet?'


  lc_retrieved_docs = langchain_retriever.get_relevant_documents(question)



Evaluating question: 'What are the main health benefits of this diet?'

Evaluating question: 'What is the primary source of fat mentioned?'

Evaluating question: 'How does the diet help improve cholesterol levels?'

Evaluating question: 'What role do antioxidants play in the Mediterranean diet?'

✅ Evaluation complete.


### Evaluate Quality with LLM-as-a-Judge

In [None]:
from google.generativeai.types import GenerationConfig

# This is a simple LLM-as-a-judge prompt template
EVALUATION_PROMPT_TEMPLATE = """
You are an impartial AI judge. Evaluate the quality of a generated answer based on a given context and question.
Your evaluation should be a score from 1 to 5, where 5 is the best.
Do not provide any explanation, just the integer score.

**Question:**
{question}

**Retrieved Context:**
---
{context}
---

**Generated Answer:**
---
{answer}
---

**Evaluation Criteria:**
1.  **Relevance**: Does the context contain the information needed to answer the question?
2.  **Faithfulness**: Is the answer fully supported by the provided context? There should be no hallucinations.
3.  **Conciseness**: Is the answer concise and to the point?

**Your Score (1-5):**
"""

evaluation_llm = genai.GenerativeModel('gemini-2.0-flash')
config = GenerationConfig(temperature=0.0) # Set temp to 0 for deterministic scoring

def evaluate_response(question, context, answer):
    """Uses Gemini to evaluate the quality of a RAG response."""
    prompt = EVALUATION_PROMPT_TEMPLATE.format(question=question, context=context, answer=answer)
    try:
        response = evaluation_llm.generate_content(prompt, generation_config=config)
        # Safely parse the score
        score = int(response.text.strip())
        return score
    except (ValueError, IndexError):
        # Handle cases where the LLM doesn't return a clean integer
        return 0 # Return a default score on failure
    except Exception as e:
        print(f"Error during evaluation: {e}")
        return 0

# Apply the evaluation function to our results
df_results['langchain_score'] = df_results.apply(
    lambda row: evaluate_response(row['question'], row['langchain_context'], row['langchain_answer']), axis=1
)

df_results['llamaindex_score'] = df_results.apply(
    lambda row: evaluate_response(row['question'], row['llamaindex_context'], row['llamaindex_answer']), axis=1
)

print("✅ Quality scores have been calculated.")

Error during evaluation: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15
Please retry in 54.154815166s. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 54
}
]
Error during evaluation: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/g

### Display Final Results

In [None]:
# For a cleaner final view, let's select and reorder columns
final_df = df_results[[
    'question',
    'langchain_score',
    'llamaindex_score',
    'langchain_latency',
    'llamaindex_latency',
    'langchain_answer',
    'llamaindex_answer'
]]

# Calculate averages
avg_lc_score = final_df['langchain_score'].mean()
avg_li_score = final_df['llamaindex_score'].mean()
avg_lc_latency = final_df['langchain_latency'].mean()
avg_li_latency = final_df['llamaindex_latency'].mean()

print("--- 📊 Evaluation Results ---")
display(final_df)

print("\n--- 📈 Average Metrics ---")
summary_data = {
    "Framework": ["LangChain", "LlamaIndex"],
    "Avg. Quality Score (1-5)": [f"{avg_lc_score:.2f}", f"{avg_li_score:.2f}"],
    "Avg. Latency (seconds)": [f"{avg_lc_latency:.2f}", f"{avg_li_latency:.2f}"]
}
summary_df = pd.DataFrame(summary_data)
display(summary_df)

--- 📊 Evaluation Results ---


Unnamed: 0,question,langchain_score,llamaindex_score,langchain_latency,llamaindex_latency,langchain_answer,llamaindex_answer
0,What is the Mediterranean diet?,5,0,2.622672,2.015608,"The Mediterranean diet is a heart-healthy, pla...","It is a heart-healthy, plant-based eating plan..."
1,What are the main health benefits of this diet?,5,0,2.555379,2.33201,The Mediterranean diet has several health bene...,This eating plan is associated with a lower ri...
2,What is the primary source of fat mentioned?,5,0,1.369761,1.463135,Olive oil is the primary source of fat mention...,Olive oil is the primary source of fat mention...
3,How does the diet help improve cholesterol lev...,5,0,1.877143,2.013202,The Mediterranean diet's emphasis on healthy f...,"The diet's focus on healthy fats, such as thos..."
4,What role do antioxidants play in the Mediterr...,5,0,2.072845,2.325169,Antioxidants play a key role in the Mediterran...,"Antioxidants, abundant in fresh produce, play ..."



--- 📈 Average Metrics ---


Unnamed: 0,Framework,Avg. Quality Score (1-5),Avg. Latency (seconds)
0,LangChain,5.0,2.1
1,LlamaIndex,0.0,2.03


### Mock Cost Calculation

In [None]:
def estimate_cost(text_input, text_output, is_embedding=False):
    input_chars = len(text_input)
    output_chars = len(text_output)

    if is_embedding:
        return (input_chars / 1000) * 0.0001

    input_cost = (input_chars / 1000) * 0.000125
    output_cost = (output_chars / 1000) * 0.000375
    return input_cost + output_cost

# 1. Embedding cost (one-time)
embedding_cost = estimate_cost(document_text, "", is_embedding=True)

# 2. Querying cost (per query)
lc_query_cost = df_results.apply(
    lambda row: estimate_cost(row['question'] + row['langchain_context'], row['langchain_answer']), axis=1
).sum()

li_query_cost = df_results.apply(
    lambda row: estimate_cost(row['question'] + row['llamaindex_context'], row['llamaindex_answer']), axis=1
).sum()

print("--- 💰 Estimated Costs ---")
print(f"One-time Embedding Cost: ${embedding_cost:.6f}")
print(f"Total LangChain Query Cost: ${lc_query_cost:.6f}")
print(f"Total LlamaIndex Query Cost: ${li_query_cost:.6f}")

--- 💰 Estimated Costs ---
One-time Embedding Cost: $0.000067
Total LangChain Query Cost: $0.000971
Total LlamaIndex Query Cost: $0.000677
