# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.14.0 openai==1.107.0 chromadb==1.0.21 llama-index-vector-stores-chroma==0.5.3 \
                llama-index-embeddings-huggingface==0.6.0 llama-index-finetuning==0.4.0 jedi==0.19.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.5/46.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00

In [2]:
import os

# Set the following API Keys in the Python environment. Will be used later.
# os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_KEY>"

from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

nest_asyncio.apply()

# Load the Models


In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(model="gpt-5-mini", additional_kwargs={'reasoning_effort':'minimal'})
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Download knowledge base


In [5]:
from huggingface_hub import hf_hub_download

hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip", repo_type="dataset", local_dir=".")

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

'vectorstore.zip'

In [6]:
!unzip -o vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


# Create vector index

In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Load the vector store from the local storage.
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create the index based on the vector store.
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Create keyword index

In [8]:
def retrieve_all_nodes_from_vector_index(vector_index, query="Whatever", similarity_top_k=100000000):
    # Set similarity_top_k to a large number to retrieve all the nodes
    vector_retriever = vector_index.as_retriever(similarity_top_k=similarity_top_k)

    # Retrieve all nodes
    all_nodes = vector_retriever.retrieve(query)
    nodes = [item.node for item in all_nodes]

    return nodes

nodes = retrieve_all_nodes_from_vector_index(vector_index)
print(len(nodes))

5834


In [9]:
from llama_index.core import SimpleKeywordTableIndex

# Define the KeywordTable mIndex using all the nodes.
keyword_index = SimpleKeywordTableIndex(nodes=nodes)

# Hybrid Retriever


In [10]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)
from typing import List

class HybridRetriever(BaseRetriever):
    """Hybrid retriever that performs both semantic search and keyword search."""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: KeywordTableSimpleRetriever,
        max_retrieve: int = 10,
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        self._max_retrieve = max_retrieve
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        resulting_nodes = []
        node_ids_added = set()
        for i in range(min(len(vector_nodes), len(keyword_nodes))):
            vector_node = vector_nodes[i]
            if vector_node.node.node_id not in node_ids_added:
                resulting_nodes += [vector_node]
                node_ids_added.add(vector_node.node.node_id)

            keyword_node = keyword_nodes[i]
            if keyword_node.node.node_id not in node_ids_added:
                resulting_nodes += [keyword_node]
                node_ids_added.add(keyword_node.node.node_id)

        return resulting_nodes

# Test hybrid retriever vs vector retriever

In [11]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Create hybrid query engine
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, num_chunks_per_query=6)
hybrid_retriever = HybridRetriever(vector_retriever, keyword_retriever, max_retrieve=6)
response_synthesizer = get_response_synthesizer(llm=Settings.llm)
hybrid_query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=response_synthesizer,
)

# Test the query engine
answer = hybrid_query_engine.query("How does KOSMOS-2 work?")
print(answer)

KOSMOS-2 is a Transformer-based causal language model trained to connect language with visual regions so it can perceive and describe the visual world and ground text to specific image regions. Key aspects of how it works:

- Training data and objective
  - Trained on a large web-scale dataset of grounded image–text pairs (GRIT) using next-word prediction (causal language modeling).
- Representing spatial information
  - Bounding box coordinates from images are converted into sequences of location tokens (e.g., patch index tokens) and appended to the corresponding entity text spans. This links textual mentions to specific image regions.
- Referential format
  - Referential expressions are encoded in a link-like/Markdown style (e.g., [text span](bounding boxes)), so object descriptions are explicitly tied to location token sequences.
- Multimodal input handling
  - The model consumes both text tokens and image-derived tokens/embeddings (including the location tokens) so it can jointly r

In [12]:
# Create vector query engine
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=6)
vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer,
)

# Test the query engine
answer = vector_query_engine.query("How does KOSMOS-2 work?")
print(answer)

KOSMOS-2 is a Transformer-based causal language model trained on a large web-scale dataset of grounded image–text pairs. It links textual mentions of objects to their image regions by converting bounding-box spatial coordinates into discrete location tokens and appending those tokens to the corresponding text spans. Referential expressions are represented like links (Markdown-style), so a text span is associated with a sequence of location tokens that identify the object patches in the image. The model is trained with next-word prediction on this grounded data, enabling it to perceive object descriptions, ground text to visual regions, generate referring expressions, perform referring-expression comprehension and phrase grounding, and handle perception–language and general language understanding/generation tasks. The code and pretrained models are available for use with an example processor + model pipeline that produces captions and structured entity outputs (including bounding-box co

# Evaluate

Run the following code if you want to generate an evaluation dataset from scratch. You can choose to download an evaluation dataset running the cell after this one.

In [13]:
# from llama_index.core.evaluation import generate_question_context_pairs

# # Create questions for each segment. These questions will be used to
# # assess whether the retriever can accurately identify and return the
# # corresponding segment when queried.
# rag_eval_dataset = generate_question_context_pairs(
#     nodes, llm=Settings.llm, num_questions_per_chunk=1
# )

# # We can save the evaluation dataset as a json file for later use.
# rag_eval_dataset.save_json("./rag_eval_dataset_question_context.json")

You can download a version of the evaluation dataset with the following code cell, so that you don't have to create the eval dataset from scratch with the code above.

In [14]:
from huggingface_hub import hf_hub_download
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Download the evaluation dataset
hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="rag_eval_dataset_question_context_subset_50.json", repo_type="dataset", local_dir=".")
rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset_question_context_subset_50.json")

(…)_dataset_question_context_subset_50.json: 0.00B [00:00, ?B/s]

In [15]:
import pandas as pd

#  A simple function to show the evaluation result.
def from_eval_results_to_dataframe(name, eval_results):
    """Convert evaluation results to a pandas dataframe."""
    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Retriever Name": [name], "Hit Rate": [hit_rate], "MRR": [mrr]}
    )

    return metric_df

In [16]:
from llama_index.core.evaluation import RetrieverEvaluator

# We can evaluate the retievers with different top_k values.
for i in [2, 4, 6, 8, 10]:
    # Evaluate hybrid retriever
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=i)
    keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index, num_chunks_per_query=i)
    hybrid_retriever = HybridRetriever(vector_retriever, keyword_retriever, max_retrieve=i)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=hybrid_retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
    print(from_eval_results_to_dataframe(f"Hybrid retriever top_{i}", eval_results))

    # Evaluate vector retriever
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=i)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=vector_retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
    print(from_eval_results_to_dataframe(f"Vector retriever top_{i}", eval_results))

           Retriever Name  Hit Rate   MRR
0  Hybrid retriever top_2      0.64  0.58
           Retriever Name  Hit Rate   MRR
0  Vector retriever top_2       0.6  0.57
           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_4      0.72  0.593524
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_4      0.68  0.593333
           Retriever Name  Hit Rate      MRR
0  Hybrid retriever top_6       0.8  0.60169
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_6       0.7  0.597333
           Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_8      0.86  0.606101
           Retriever Name  Hit Rate       MRR
0  Vector retriever top_8      0.78  0.608405
            Retriever Name  Hit Rate       MRR
0  Hybrid retriever top_10      0.86  0.606101
            Retriever Name  Hit Rate       MRR
0  Vector retriever top_10       0.8  0.610627
