## 1. Import libraries

In [50]:
import nest_asyncio
# Patch asyncio so that an event loop can be re-entered. Presumably needed
# because the notebook kernel already runs a loop while later cells use
# async llama-index features (use_async=True, BatchEvalRunner) — confirm
# before removing.
nest_asyncio.apply()

In [51]:
import functools
import os
import pickle
import random
import time

from llama_index.core import Settings, Document, VectorStoreIndex, SummaryIndex, StorageContext, load_index_from_storage
from llama_index.core.tools import QueryEngineTool
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.llms.openai import OpenAI
from openai import RateLimitError
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator, RetrieverEvaluator, DatasetGenerator, BatchEvalRunner
from llama_index.core.evaluation import CorrectnessEvaluator
from tqdm import tqdm

## 2. Building the RAG System

In [52]:
def retry_with_exponential_backoff(
    func,
    initial_delay: float = 1,
    exponential_base: float = 2,
    jitter: bool = True,
    max_retries: int = 15,
    errors: tuple = (RateLimitError,),
):
    """Decorate ``func`` so that it is retried with exponential backoff.

    Args:
        func: The callable to wrap.
        initial_delay: Base delay in seconds. The delay is multiplied *before*
            each sleep, so the first wait is already
            ``initial_delay * exponential_base`` (times the jitter factor).
        exponential_base: Factor the delay grows by after every failed attempt.
        jitter: When true, each growth step is additionally scaled by a random
            factor in ``[1, 2)`` so that concurrent callers do not retry in
            lockstep.
        max_retries: Number of retries allowed after the first failed call.
        errors: Exception types that trigger a retry. Any other exception
            propagates to the caller immediately.

    Returns:
        A wrapper with the same call signature as ``func`` that returns
        whatever ``func`` returns.

    Raises:
        Exception: When ``func`` has failed with one of ``errors`` more than
            ``max_retries`` times. The last underlying error is chained as
            ``__cause__`` so the real failure stays visible in the traceback.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        num_retries = 0
        delay = initial_delay

        # Loop until the call succeeds, the retry budget is exhausted, or a
        # non-retryable exception escapes.
        while True:
            try:
                return func(*args, **kwargs)

            # Retry only on the configured error types; everything else
            # propagates unchanged.
            except errors as e:
                num_retries += 1
                print('Number of retries: ',num_retries)

                if num_retries > max_retries:
                    raise Exception(
                        f"Maximum number of retries ({max_retries}) exceeded."
                    ) from e

                # Grow the delay (with optional jitter), then wait.
                delay *= exponential_base * (1 + jitter * random.random())
                time.sleep(delay)

    return wrapper

def query_chatbot(query_engine, query):
    """Run ``query`` against ``query_engine`` and return its response.

    Args:
        query_engine: Any object exposing a ``query(query)`` method.
        query: The question to ask.

    Returns:
        Whatever ``query_engine.query(query)`` returns. The result used to be
        dropped, so callers such as ``query_with_backoff`` always got ``None``.
    """
    return query_engine.query(query)

@retry_with_exponential_backoff
def query_with_backoff(**kwargs):
    """Call ``query_chatbot`` with the given keyword arguments, under retry.

    Accepts keyword arguments only and forwards them untouched; the result of
    ``query_chatbot`` is handed back as-is. Retry behaviour comes from the
    ``retry_with_exponential_backoff`` decorator with its default settings.
    """
    outcome = query_chatbot(**kwargs)
    return outcome

def _text_after(text, marker):
    """Return everything in ``text`` after the first ``marker``, or None if it is absent."""
    _, found, rest = text.partition(marker)
    return rest if found else None


def _first_line_after(text, marker):
    """Return the stripped rest of the line following ``marker``, or None if it is absent."""
    rest = _text_after(text, marker)
    return None if rest is None else rest.split('\n')[0].strip()


def query_from_llama_chatbot(query, query_engine,retriever):
    """Answer ``query`` and print the answer together with the top retrieved chunk.

    Args:
        query: The question to ask.
        query_engine: Object with a ``query(query)`` method producing the answer.
        retriever: Object with a ``retrieve(query)`` method returning a sequence
            of hits whose ``.node.text`` holds the chunk text.

    Returns:
        ``(response, source)`` where ``source`` is the full text of the first
        retrieved chunk, or ``None`` when the retriever returned nothing.

    The chunk text is expected to carry ``Title:``, ``Session title:`` and
    ``Content:`` markers. A chunk without them (for example a continuation
    chunk of a long document) no longer raises ``IndexError``: a missing title
    is shown as ``Unknown``, a missing session title is skipped, and a missing
    ``Content:`` marker falls back to printing the whole chunk.
    """
    response = query_engine.query(query)
    # NOTE: the displayed source comes from a separate retrieval pass, not from
    # the response object itself, so it may differ from what the engine used.
    nodes = retriever.retrieve(query)

    print('Answer: ', str(response))
    print('-'*3)

    if not nodes:
        # Nothing retrieved: still hand back the answer instead of crashing.
        print('Source used: none retrieved')
        print('-'*3)
        return response, None

    source = nodes[0].node.text
    title = _first_line_after(source, 'Title:')
    session_title = _first_line_after(source, 'Session title:')
    # Keep everything after the first 'Content:' so a body that itself
    # mentions 'Content:' is not cut short.
    content = _text_after(source, 'Content:')
    content = source.strip() if content is None else content.strip()

    print('Source used:')
    print('- Blog Title: ', title if title is not None else 'Unknown')

    # The literal string 'None' marks a chunk that belongs to no session.
    if session_title not in (None, 'None'):
        print('- Session title: ', session_title)

    print('- Content: \n', content)
    print('-'*3)

    return response, source
    
# Read the OpenAI key from a file kept outside the notebook directory.
# Strip surrounding whitespace: a trailing newline from the key file would
# otherwise become part of the key sent to the API.
with open('../API_Key') as file:
    os.environ['OPENAI_API_KEY'] = file.read().strip()

# Load the crawled content from the file.
# NOTE(security): pickle.load can execute arbitrary code; only load a
# crawl_content.pkl that this project produced itself.
with open('crawl_content.pkl', 'rb') as crawl_file:
    crawl_content = pickle.load(crawl_file)

# Set the global LLM and embedding model
Settings.llm = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(api_key=os.getenv('OPENAI_API_KEY'), model="text-embedding-3-small")
print("Loaded the embed model.")
print('---')

storage_context_name = 'storage_openai'

# Both indexes are persisted side by side under the storage directory.
vector_store_dir = f'./{storage_context_name}/vector_store'
summary_store_dir = f'./{storage_context_name}/summary_store'

if os.path.exists(vector_store_dir) and os.path.exists(summary_store_dir):
    # Both stores are on disk: reload them instead of re-building.
    print('Already created indexes')

    vector_storage_context = StorageContext.from_defaults(persist_dir=vector_store_dir)
    vector_index = load_index_from_storage(vector_storage_context)

    summary_storage_context = StorageContext.from_defaults(persist_dir=summary_store_dir)
    summary_index = load_index_from_storage(summary_storage_context)

    print('Loaded indexes from storage')
    print('---')
else:
    # At least one store is missing: rebuild both from the crawled content.
    documents = [Document(text=content) for content in tqdm(crawl_content)]
    print(f"Loaded {len(documents)} documents.")
    print('---')

    print("There are no index storage yet")
    print("Start to create indexes...")
    vector_index = VectorStoreIndex.from_documents(documents, show_progress=True)
    summary_index = SummaryIndex.from_documents(documents, show_progress=True)
    print("Created the indexes.")

    # Persist both indexes so the next run takes the load path above.
    print("Start to store the indexes...")
    vector_index.storage_context.persist(persist_dir=vector_store_dir)
    summary_index.storage_context.persist(persist_dir=summary_store_dir)
    print('Store the indexes')
    print('---')

# One query engine per index: the vector engine answers from the single best
# matching chunk, the summary engine uses tree summarization.
vector_query_engine = vector_index.as_query_engine(similarity_top_k=1)
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)
print('Created the query engine')
print('---')

# Wrap each engine as a tool; the router below is given these tools and
# their descriptions to choose between.
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description="Useful for summarization questions related to the llama-index blogs",
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description="Useful for retreiving specific information in the llama-index blogs",
)
print("Created query tool")
print("-"*3)

# Router that picks exactly one of the two tools per query.
router_selector = LLMSingleSelector.from_defaults()
router_tools = [summary_tool, vector_tool]
query_engine = RouterQueryEngine(
    selector=router_selector,
    query_engine_tools=router_tools,
    verbose=True,
)
print("Created the router query engine.")
print('-'*3)

# Stand-alone retriever (top 2 chunks) used to display the source of an answer.
retriever = vector_index.as_retriever(similarity_top_k=2)
print('Created the retriever')
print('---')

Loaded the embed model.
---
Already created indexes
Loaded indexes from storage
---
Created the query engine
---
Created query tool
---
Created the router query engine.
---
Created the retriever
---


In [53]:
# Evaluators. Each is built with its defaults.
faithfulness_eval = FaithfulnessEvaluator()
relevancy_eval = RelevancyEvaluator()
# CorrectnessEvaluator must be imported in the import cell, otherwise this
# line raises NameError on a fresh kernel. It is created here but NOT
# registered with the runner below.
correctness_eval = CorrectnessEvaluator()

# Only faithfulness and relevancy are run in batch; the keys are the names
# used to look the results up later.
batch_evaluators = {
    'faithfulness': faithfulness_eval,
    'relevancy': relevancy_eval,
}
runner = BatchEvalRunner(batch_evaluators, workers=8)

In [62]:
def ask_chatbot(query):
    """Answer ``query`` with the router engine, then print its evaluation.

    Uses the module-level ``query_engine``, ``retriever`` and ``runner``.
    The evaluation is run on the very response that was just printed, rather
    than on a second answer from a freshly built vector query engine, so the
    reported faithfulness and relevancy describe the answer the reader sees.

    Args:
        query: The question to ask.

    Returns:
        None. All results are printed.
    """
    print ("Now querying...")
    print (f"Querying: {query}")
    print('-'*3)
    response, _ = query_from_llama_chatbot(query=query,query_engine=query_engine, retriever= retriever)

    print('Evaluation:')
    eval_results = runner.evaluate_responses(queries=[query], responses=[response])

    # One query was submitted, so each evaluator holds exactly one result.
    faithfulness = eval_results["faithfulness"][0]
    relevancy = eval_results["relevancy"][0]

    print('Faithfulness:')
    print('- Passing: ', faithfulness.passing)
    print('- Score: ', faithfulness.score)
    print('-'*3)

    print('Relevancy')
    print('- Passing: ', relevancy.passing)
    # This line used to print the faithfulness score a second time.
    print('- Score: ', relevancy.score)

## 3. Testing

In [64]:
# Test 1: a feature question about a single blog post.
question1 = (
    "What are key features of llama-agents?"
)
ask_chatbot(query=question1)

Now querying...
Querying: What are key features of llama-agents?
---
[1;3;38;5;200mSelecting query engine 1: The question asks for specific information about key features, which aligns with retrieving specific information..
[0mAnswer:  Key features of llama-agents include a distributed service-oriented architecture where each agent operates as an independently running microservice, a communication system utilizing standardized API interfaces through a central control plane, and the ability to define both agentic and explicit orchestration flows. Additionally, it offers ease of deployment for launching, scaling, and monitoring agents independently, along with scalability and resource management through built-in observability tools to track system performance and individual agent services.
---
Source used:
- Blog Title:  Introducing llama-agents: A Powerful Framework for Building Production Multi-Agent AI Systems
- Session title:  Key Features of llama-agents
- Content: 
 Distributed S

In [65]:
# Test 2: a question that names a specific section of a blog post.
question2 = (
    "What are the two critical areas of RAG system performance that are assessed "
    "in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?"
)
ask_chatbot(query=question2)

Now querying...
Querying: What are the two critical areas of RAG system performance that are assessed in the 'Evaluating RAG with LlamaIndex' section of the OpenAI Cookbook?
---
[1;3;38;5;200mSelecting query engine 1: The question asks for specific information regarding the critical areas of RAG system performance, which aligns with retrieving specific information..
[0mAnswer:  The two critical areas of RAG system performance assessed in the 'Evaluating RAG with LlamaIndex' section are the Retrieval System and Response Generation.
---
Source used:
- Blog Title:  OpenAI Cookbook: Evaluating RAG systems
- Content: 
 We’re excited to unveil our OpenAI Cookbook, a guide to evaluating Retrieval-Augmented Generation (RAG) systems using LlamaIndex. We hope you’ll find it useful in enhancing the effectiveness of your RAG systems, and we’re thrilled to share it with you.
The OpenAI Cookbook has three sections:
Understanding Retrieval-Augmented Generation (RAG): provides a detailed overview of

In [66]:
# Test 3: a question about the metrics used to compare rerankers.
question3 = (
    "What are the two main metrics used to evaluate the performance "
    "of the different rerankers in the RAG system?"
)
ask_chatbot(query=question3)

Now querying...
Querying: What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?
---
[1;3;38;5;200mSelecting query engine 1: The question asks for specific information regarding metrics used to evaluate performance, which aligns with retrieving specific information..
[0mAnswer:  The two main metrics used to evaluate the performance of the different rerankers in the RAG system are hit rate and MRR (Mean Reciprocal Rank).
---
Source used:
- Blog Title:  Boosting RAG: Picking the Best Embedding & Reranker models
- Session title:  Impact of Rerankers:
- Content: 
 WithoutReranker: This provides the baseline performance for each embedding.
bge-reranker-base: Generally improves both hit rate and MRR across embeddings.
bge-reranker-large: This reranker frequently offers the highest or near-highest MRR for embeddings. For several embeddings, its performance rivals or surpasses that of the CohereRerank.
CohereRerank: Consistently enhances 