# Install Packages and Set Up Variables


In [1]:
# Install pinned versions of LlamaIndex, its Chroma vector-store and LLM integrations
# (Google GenAI, OpenAI), ChromaDB itself, and jedi. `-q` keeps pip's output short.
!pip install -q llama-index==0.14.0 llama-index-vector-stores-chroma==0.5.3 llama-index-llms-google-genai==0.5.0 \
                chromadb==1.0.21 llama-index-llms-openai==0.5.6 jedi==0.19.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00

In [2]:
import os

# Both API keys are exported as environment variables here and are used later in the notebook.
# When not reading them from Colab secrets, set them directly instead:
# os.environ["OPENAI_API_KEY"] = "[OPENAI_API_KEY]"
# os.environ["GOOGLE_API_KEY"] = "<YOUR_API_KEY>"

# Read the keys from Colab's `userdata` store so no key is hardcoded in the notebook.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# NOTE(review): this secret is looked up as 'Google_api_key' (mixed case), unlike the
# upper-case 'OPENAI_API_KEY' above — presumably it must match the stored secret name exactly.
os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
import nest_asyncio

# Apply the patch for this kernel session.
nest_asyncio.apply()

## Language Model and Embedding Model

In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.llms.google_genai import GoogleGenAI

# Alternative LLM (OpenAI); uncomment to use it instead of the Gemini model below.
# Settings.llm = OpenAI(model="gpt-5-mini")

# Register the LLM and the embedding model on LlamaIndex's shared `Settings` object.
# The query engine created later in this notebook is passed `Settings.llm` explicitly.
# NOTE(review): the embedding model is presumably picked up as the default when the index
# embeds nodes and queries — confirm against the LlamaIndex docs.
Settings.llm = GoogleGenAI(model="gemini-2.5-flash", temperature=1)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

**Note: You can create a vector store from scratch using the code below, or you can load it from Hugging Face using the code provided in this notebook.**

# Create a Vector Store


In [5]:
import chromadb

# Create a client and a new collection.
# PersistentClient keeps the database on disk under ./ai_tutor_knowledge (the directory that
# is zipped later in this notebook); chromadb.EphemeralClient would keep the data in memory only.
chroma_client = chromadb.PersistentClient(path="./ai_tutor_knowledge")
# NOTE(review): create_collection is expected to raise if "ai_tutor_knowledge" already exists
# in that directory (e.g. when this cell is re-run) — confirm, and consider
# get_or_create_collection if re-runs should be tolerated.
chroma_collection = chroma_client.create_collection("ai_tutor_knowledge")

In [6]:
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.storage.storage_context import StorageContext

# Wrap the Chroma collection in a LlamaIndex vector store, then build a storage context
# around it; that storage context is passed to VectorStoreIndex when the index is created.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Load the Dataset (JSON)


## Download


In [7]:
from huggingface_hub import hf_hub_download
# Download the JSONL knowledge-base file from the Hugging Face dataset repo into /content.
# `file_path` holds the value returned by hf_hub_download and is opened in the next code cell.
# NOTE(review): /content looks like the Colab working directory — adjust `local_dir` when
# running elsewhere.
file_path = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="ai_tutor_knowledge.jsonl",repo_type="dataset",local_dir="/content")

ai_tutor_knowledge.jsonl: 0.00B [00:00, ?B/s]

# Read File


In [8]:
import json

# Parse the JSON Lines file: one JSON object per non-empty line.
# The encoding is set explicitly so decoding does not depend on the platform's default
# locale, and blank lines (e.g. an empty line at the end of the file) are skipped instead
# of crashing json.loads.
with open(file_path, "r", encoding="utf-8") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file if line.strip()]

# Number of records loaded.
len(ai_tutor_knowledge)

762

# Convert to Document Objects


In [9]:
from typing import List
from llama_index.core import Document

def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    """Build one ``Document`` per record in ``data_list``.

    Each record is read by key: ``doc_id`` and ``content`` are passed as the
    document's ``doc_id`` and ``text``; ``url``, ``name`` (stored as ``title``),
    ``tokens`` and ``source`` are stored as metadata. A record that lacks any
    of these keys raises ``KeyError``.

    ``title``, ``tokens`` and ``source`` are listed in
    ``excluded_llm_metadata_keys``; ``url``, ``tokens`` and ``source`` are
    listed in ``excluded_embed_metadata_keys``.

    Args:
        data_list: Records (dicts) with the keys described above.

    Returns:
        The documents, in the same order as ``data_list``.
    """
    hidden_from_llm = ("title", "tokens", "source")
    hidden_from_embedding = ("url", "tokens", "source")

    return [
        Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={  # type: ignore
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "source": record["source"],
            },
            # Fresh list per document, so no two documents share the same list object.
            excluded_llm_metadata_keys=list(hidden_from_llm),
            excluded_embed_metadata_keys=list(hidden_from_embedding),
        )
        for record in data_list
    ]

# Convert every loaded JSONL record into a Document (note: `doc` is a list of Documents).
doc = create_docs_from_list(ai_tutor_knowledge)

# Transforming


In [10]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Create the sentence window node parser.
# The two metadata keys configured here ("window" and "original_text") are the ones read
# back from the retrieved source nodes at the end of the notebook.
# NOTE(review): window_size=3 presumably means 3 sentences of context on each side of the
# node's own sentence — confirm against the SentenceWindowNodeParser docs.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    include_metadata=True,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [11]:
# Keep only the documents whose "tokens" metadata value is below 8000, then split the
# remaining documents into nodes with the sentence window parser.
# NOTE(review): the reason for the 8000 cut-off is not stated in this notebook.
documents = [
    candidate for candidate in doc if candidate.metadata["tokens"] < 8000
]
nodes = node_parser.get_nodes_from_documents(documents)

In [12]:
# Sanity check: how many nodes the parser produced.
len(nodes)

22083

In [13]:
# Inspect the first node, including the metadata attached to it.
nodes[0]

TextNode(id_='c6a654d8-4f46-4ca7-b526-e4b19c9dbc73', embedding=None, metadata={'url': 'https://towardsai.net/p/machine-learning/bert-huggingface-model-deployment-using-kubernetes-github-repo-03-07-2024', 'title': 'BERT HuggingFace Model Deployment using Kubernetes [ Github Repo]  03/07/2024', 'tokens': 768, 'source': 'tai_blog', 'window': 'Github Repo : https://github.com/vaibhawkhemka/ML-Umbrella/tree/main/MLops/Model_Deployment/Bert_Kubernetes_deployment   Model development is useless if you dont deploy it to production  which comes with a lot of issues of scalability and portability.    I have deployed a basic BERT model from the huggingface transformer on Kubernetes with the help of docker  which will give a feel of how to deploy and manage pods on production.    Model Serving and Deployment:ML Pipeline:Workflow:   Model server (using FastAPI  uvicorn) for BERT uncased model    Containerize model and inference scripts to create a docker image    Kubernetes deployment for these mode

## Index creation

In [14]:
from llama_index.core import VectorStoreIndex

# Add the nodes to the database and create the index / embeddings.
# The storage context ties the index to the Chroma-backed vector store, and
# show_progress=True displays the "Generating embeddings" progress bars.
index = VectorStoreIndex(nodes, storage_context=storage_context,show_progress =True)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1603 [00:00<?, ?it/s]

In [15]:
# Compress the vector store directory (the ./ai_tutor_knowledge path given to the Chroma
# PersistentClient above) into a zip file so it can be downloaded and reused later.
!zip -r vectorstore-windowed.zip ai_tutor_knowledge

  adding: ai_tutor_knowledge/ (stored 0%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/ (stored 0%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/length.bin (deflated 73%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/header.bin (deflated 56%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/link_lists.bin (deflated 78%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/data_level0.bin (deflated 17%)
  adding: ai_tutor_knowledge/7b5f10b0-2d85-4217-ab0a-3f906f456884/index_metadata.pickle (deflated 44%)
  adding: ai_tutor_knowledge/chroma.sqlite3 (deflated 86%)


# Load Indexes


**Note: If you didn’t create the vector store from scratch, please uncomment and run the three code cells below.**

In [16]:
# from huggingface_hub import hf_hub_download
# vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore-windowed.zip",repo_type="dataset",local_dir="/content")

In [17]:
# !unzip vectorstore-windowed.zip

In [18]:
# import chromadb
# from llama_index.vector_stores.chroma import ChromaVectorStore
# from llama_index.core import VectorStoreIndex

# # Open the persisted Chroma collection
# db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
# chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# # Create your index
# index = VectorStoreIndex.from_vector_store(vector_store)

In [19]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# Build a query engine on top of the index, answering with the LLM configured in Settings.
query_engine = index.as_query_engine(
    llm=Settings.llm,
    # The target key is "window", the same key the node parser was configured to write
    # (window_metadata_key="window").
    # NOTE(review): the post-processor presumably swaps each retrieved node's text for the
    # value stored under that metadata key — confirm in the LlamaIndex docs.
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

In [20]:
# Run a sample question through the query engine and print the generated answer.
response = query_engine.query("Write about naive RAG and Speculative RAG?")
print(response)



The provided context does not explicitly define "naive RAG." However, it refers to "standard RAG approaches" and "the most competitive standard RAG model, Mistral-Instruc^{as}_{7B}," which Speculative RAG is compared against and outperforms. These standard approaches likely represent a baseline RAG without the advanced features of iterative LLM refinement or self-critique mentioned as "recent RAG advancements."

SPECULATIVE RAG is a framework designed to accelerate and improve Retrieval Augmented Generation (RAG). It leverages a larger generalist Language Model (LM) to efficiently verify multiple RAG drafts. These drafts are produced in parallel by a smaller, distilled specialist LM. Each draft is generated from a distinct subset of retrieved documents, which offers diverse perspectives on the evidence and helps reduce input token counts per draft. This approach enhances comprehension of each subset and mitigates potential position bias over long contexts. The method accelerates RAG by

In [21]:
# For every source node attached to the response, print the node's "original_text"
# metadata next to its "window" metadata, numbering the sources from 1.
for idx, item in enumerate(response.source_nodes, start=1):
    node_metadata = item.node.metadata
    print("Source ", idx)
    print("Original Text:", node_metadata["original_text"])
    print("Window:", node_metadata["window"])
    print("----")

Source  1
Original Text: In this work, we introduce SPECULATIVE RAG – a framework that leverages a larger generalist LM to efficiently verify multiple RAG drafts produced in parallel by a smaller, distilled specialist LM. 
Window: **Abstract**Retrieval augmented generation (RAG) combines the generative abilities of large language models (LLMs) with external knowledge sources to provide more accurate and up-to-date responses.  Recent RAG advancements focus on improving retrieval outcomes through iterative LLM refinement or self-critique capabilities acquired through additional instruction tuning of LLMs.  In this work, we introduce SPECULATIVE RAG – a framework that leverages a larger generalist LM to efficiently verify multiple RAG drafts produced in parallel by a smaller, distilled specialist LM.  Each draft is generated from a distinct subset of retrieved documents, offering diverse perspectives on the evidence while reducing input token counts per draft.  This approach enhances comp