# Introduction to Advanced RAG in LlamaIndex

In [None]:
# Install nest_asyncio into the kernel's environment; it is imported and applied in the next cell.
%pip install nest_asyncio



In [None]:
# Patch asyncio before any LlamaIndex code runs.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later cells (e.g. the ingestion pipeline) use async calls — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Install (or upgrade, -U) llama-index quietly (-q) into the kernel's environment.
# NOTE(review): the version is not pinned, so results may change between runs.
%pip install -Uq llama-index

## Extract

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load every file found under ./data into a list of Document objects.
docs = SimpleDirectoryReader(input_dir="./data").load_data()

# Alternative: use the file name as the document id instead of a random UUID
# docs_name_as_id = SimpleDirectoryReader(input_dir="./data", filename_as_id=True).load_data()

In [None]:
# Number of Document objects loaded from the PDF.
len(docs)  # one per page

19

In [None]:
# Pretty-print the loaded Document objects (ids, metadata, exclusion lists, text).
# NOTE(review): this dumps every page of the PDF; printing only docs[0] would
# keep the output readable.
import pprint
pprint.pprint(docs)

[Document(id_='6e6bc31c-24ec-46d9-a214-8010c98d526d', embedding=None, metadata={'page_label': '1', 'file_name': '2502.09838v2.pdf', 'file_path': '/content/data/2502.09838v2.pdf', 'file_type': 'application/pdf', 'file_size': 8787043, 'creation_date': '2025-02-23', 'last_modified_date': '2025-02-23'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='HealthGPT: A Medical Large Vision-Language Model for Unifying\nComprehension and Generation via Heterogeneous Knowledge Adaptation\nTianwei Lin1, Wenqiao Zhang1, Sijing Li1, Yuqian Yuan1, Binhe Yu2, Haoyuan Li3, Wanggui He3, Hao Jiang3,\nMengze Li4, Xiaohui Song1, Siliang Tang1, Jun Xiao1, Hui Lin

## Transform

In [None]:
# Inspect the raw attributes of the first document to see which metadata keys
# are already excluded from the LLM view and from the embedding view.

docs[0].__dict__ # verbose: dumps every field of a single Document

{'id_': '6e6bc31c-24ec-46d9-a214-8010c98d526d',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': '2502.09838v2.pdf',
  'file_path': '/content/data/2502.09838v2.pdf',
  'file_type': 'application/pdf',
  'file_size': 8787043,
  'creation_date': '2025-02-23',
  'last_modified_date': '2025-02-23'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='HealthGPT: A Medical Large Vision-Language Model for Unifying\nComprehension and Generation via Heterogeneous Knowledge Adaptation\nTianwei Lin1, Wenqiao Zhang1, Sijing Li1, Yuqian Yuan1, Binhe Yu2, Haoyuan Li3, Wanggui He3, Hao Jiang3,\nMe

In [None]:
# Quick example of what the LLM (and, optionally, the embedding model) sees
# for a hand-built test document.

from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    # excluded_embed_metadata_keys=["file_name"],
    # "category" stays in the metadata but is left out of the text sent to the LLM.
    excluded_llm_metadata_keys=["category"],
    # The field is spelled `metadata_separator` (see the `docs[0].__dict__`
    # dump above); the previous `metadata_seperator` spelling was a typo.
    metadata_separator="\n",
    # How each metadata entry is rendered, and how metadata and content are combined.
    metadata_template="{key}:{value}",
    text_template="Metadata:\n{metadata_str}\n-----\nContent:\n{content}",
)

print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
# Uncomment to compare with the embedding model's view:
# print(
#     "The Embedding model sees this: \n",
#     document.get_content(metadata_mode=MetadataMode.EMBED),
# )

The LLM sees this: 
 Metadata:
file_name:super_secret_document.txt
author:LlamaIndex
-----
Content:
This is a super-customized document


In [None]:
# Show the first page exactly as the embedding model receives it. The LLM view
# is identical here because both exclusion lists on the document hold the same keys.
from llama_index.core.schema import MetadataMode

# print(docs[0].get_content(metadata_mode=MetadataMode.LLM))   # what the LLM sees
print(docs[0].get_content(metadata_mode=MetadataMode.EMBED)) # what the embedding model sees; in this case, the same thing

page_label: 1
file_path: /content/data/2502.09838v2.pdf

HealthGPT: A Medical Large Vision-Language Model for Unifying
Comprehension and Generation via Heterogeneous Knowledge Adaptation
Tianwei Lin1, Wenqiao Zhang1, Sijing Li1, Yuqian Yuan1, Binhe Yu2, Haoyuan Li3, Wanggui He3, Hao Jiang3,
Mengze Li4, Xiaohui Song1, Siliang Tang1, Jun Xiao1, Hui Lin1, Yueting Zhuang1, Beng Chin Ooi5
1Zhejiang University, 2University of Electronic Science and Technology of China, 3Alibaba,
4The Hong Kong University of Science and Technology,5National University of Singapore
Project Page
 Code
1. X-Ray 
Comprehension
2. CT
Comprehension
3. MRI
Comprehension
Comp. Perf.
7 Medical Multi-Modal Comprehension Tasks
Gen. 
Performance
5 Medical Multi-Modal Generation Tasks
List all anatomical locations showing 
pulmonary edema, hazy opacity, or 
mediastinal displacement.
Left hilar structures, left lung,
right hilar structures, right lung.
Which abdominal organ shows any 
indication of a lesion or abnormality 

In [None]:
# Give every page the same "Metadata / Content" layout and hide the page label
# from the embedding model. Safe to re-run: the key is appended only once.
for page_doc in docs:
    page_doc.text_template = "Metadata:\n{metadata_str}\n---\nContent:\n{content}"

    embed_hidden_keys = page_doc.excluded_embed_metadata_keys
    if "page_label" in embed_hidden_keys:
        continue  # already excluded on a previous run
    embed_hidden_keys.append("page_label")

In [None]:
# After editing the content seen by the embedding model: page_label is no longer
# listed and the new "Metadata / Content" template is applied.

print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

Metadata:
file_path: /content/data/2502.09838v2.pdf
---
Content:
HealthGPT: A Medical Large Vision-Language Model for Unifying
Comprehension and Generation via Heterogeneous Knowledge Adaptation
Tianwei Lin1, Wenqiao Zhang1, Sijing Li1, Yuqian Yuan1, Binhe Yu2, Haoyuan Li3, Wanggui He3, Hao Jiang3,
Mengze Li4, Xiaohui Song1, Siliang Tang1, Jun Xiao1, Hui Lin1, Yueting Zhuang1, Beng Chin Ooi5
1Zhejiang University, 2University of Electronic Science and Technology of China, 3Alibaba,
4The Hong Kong University of Science and Technology,5National University of Singapore
Project Page
 Code
1. X-Ray 
Comprehension
2. CT
Comprehension
3. MRI
Comprehension
Comp. Perf.
7 Medical Multi-Modal Comprehension Tasks
Gen. 
Performance
5 Medical Multi-Modal Generation Tasks
List all anatomical locations showing 
pulmonary edema, hazy opacity, or 
mediastinal displacement.
Left hilar structures, left lung,
right hilar structures, right lung.
Which abdominal organ shows any 
indication of a lesion or abno

Here are other, more advanced transformations. Some require an LLM to work. We will use Qwen 2.5 32B Instruct 128k through Groq, which is an affordable, high-rate model. It should be enough to extract Q&As and titles from the documents.

In [None]:
# Install the Groq LLM integration for LlamaIndex (provides llama_index.llms.groq).
%pip install -Uq llama-index-llms-groq

In [None]:
import getpass
import os

from llama_index.llms.groq import Groq

# Ask for the key only when it is not already configured, so re-running the
# cell (or Restart & Run All with the variable exported) neither blocks on a
# prompt nor overwrites a valid key. getpass keeps the key out of the output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

Enter your Groq API key: ··········


In [None]:
# LLM used by the metadata extractors (title and question generation) in the ingestion pipeline below.
llm_transformations = Groq(model="qwen-2.5-32b", api_key=os.environ["GROQ_API_KEY"])

In [None]:
# Further transformations: split each page into chunks, then let the LLM attach
# a document title and a list of answerable questions to every chunk.

from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Step 1: chunking.
text_splitter = SentenceSplitter(separator=" ", chunk_size=1024, chunk_overlap=128)

# Steps 2 and 3: LLM-backed metadata extractors.
title_extractor = TitleExtractor(llm=llm_transformations, nodes=5)
qa_extractor = QuestionsAnsweredExtractor(llm=llm_transformations, questions=3)

# The transformations run in the order listed.
pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, qa_extractor],
)

nodes = pipeline.run(documents=docs, in_place=True, show_progress=True)

Parsing nodes:   0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
100%|██████████| 2/2 [00:00<00:00,  5.78it/s]
100%|██████████| 2/2 [00:00<00:00,  5.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
100%|██████████| 2/2 [00:00<00:00,  5.65it/s]
100%|██████████| 3/3 [00:00<00:00,  6.33it/s]
100%|██████████| 2/2 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  2.96it/s]
100%|██████████| 3/3 [00:00<00:00,  3.94it/s]
100%|██████████| 3/3 [00:00<00:00,  4.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  2.96it/s]
100%|██████████| 1/1 [00:00<00:00,  3.26it/s]
100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
100%|██████████| 2/2 [00:00<00:00,  5.13it/s]
100%|██████████| 1/1 [00:00<00:00,  2.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
100%|██████████| 1/1 [00:00<00:00,  5.89it/s]
100%|██████████| 1/1 [00:00<00:00,  5.19it/s]
100%|██████████| 30/30 [00:16<00:00,  1.85it/s]


By default, LlamaIndex uses OpenAI's embedding models, but you can also choose to load a free model from Hugging Face (it will be slower).

In [None]:
# Number of chunks (nodes) produced by the pipeline from the loaded pages.
len(nodes)

30

In [None]:
import pprint

# Uncomment to dump every attribute of the first node:
# pprint.pprint(nodes[0].__dict__)

# Show the first node as the LLM sees it, including the generated
# document_title and questions_this_excerpt_can_answer metadata.
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

[Excerpt from document]
page_label: 1
file_path: /content/data/2502.09838v2.pdf
document_title: Title: HealthGPT: A Comprehensive Medical Vision-Language Model for Multi-Modal Comprehension and Generation in Healthcare Applications

This title effectively encapsulates the essence of the document, suggesting that it discusses a sophisticated AI model named HealthGPT, which is designed to handle both visual and textual data in the context of healthcare applications.
questions_this_excerpt_can_answer: Based on the provided context, here are three specific questions that this document can answer, which are unlikely to be found elsewhere:

1. **What are the specific anatomical locations identified by HealthGPT as showing pulmonary edema, hazy opacity, or mediastinal displacement in a given medical image?**
   - This question is specific to the capabilities of HealthGPT and the particular medical images it has analyzed, making the answer unique to this document.

2. **How does HealthGPT inte

## Index

In [None]:
# Install the Hugging Face embeddings integration (provides llama_index.embeddings.huggingface).
%pip install -Uq llama-index-embeddings-huggingface

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Embeddings: load a Hugging Face model and sanity-check it on one string.

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

hf_embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

test_embed = hf_embeddings.get_text_embedding("Hello world")

# Report the vector size and a short preview instead of printing every float,
# which floods the notebook output without adding information.
print(f"embedding dimensions: {len(test_embed)}")
print(f"first values: {test_embed[:5]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.01519613154232502, -0.022570662200450897, 0.008547082543373108, -0.07417060434818268, 0.003836424555629492, 0.0027135657146573067, -0.03126790374517441, 0.04463401064276695, 0.04405517131090164, -0.007871180772781372, -0.025200815871357918, -0.033366575837135315, 0.014427902176976204, 0.04653819277882576, 0.008555065840482712, -0.01614576019346714, 0.0074058398604393005, -0.019012469798326492, -0.11472626030445099, -0.018157577142119408, 0.12635937333106995, 0.029702914878726006, 0.025281012058258057, -0.03421789035201073, -0.04099970683455467, 0.006617303937673569, 0.010270675644278526, 0.022362256422638893, 0.004436364397406578, -0.1273096203804016, -0.016149284318089485, -0.020380139350891113, 0.04721219092607498, 0.011579902842640877, 0.0681871548295021, 0.007298648823052645, -0.0178530216217041, 0.0407821349799633, -0.01026944350451231, 0.023757033050060272, 0.010602838359773159, -0.02858441136777401, 0.008159711956977844, -0.015180503949522972, 0.030896244570612907, -0.0659798

In [None]:
# Create the index: build a vector index over the enriched nodes, embedding
# them with the Hugging Face model loaded above.

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(nodes, embed_model=hf_embeddings)

## Query

In [None]:
# A separate LLM is used for answering questions over the index.
llm_querying = Groq(
    model="llama-3.3-70b-versatile",
    api_key=os.environ["GROQ_API_KEY"],
)

# Wrap the index in a query engine that answers with that LLM.
query_engine = index.as_query_engine(llm=llm_querying)

response = query_engine.query("what does this model do?")
print(response)

The model, specifically referred to in the context as various versions and types such as HealthGPT, Med-Flamingo, LLaV A-Med, and others, is designed for multimodal learning tasks. These tasks include visual comprehension, where the model processes and understands visual data, and generation tasks, where the model generates images or text based on given inputs. The models are also compared in terms of their performance in comprehension tasks, indicating their ability to understand and process complex information from different modalities, such as vision and language. Additionally, some models like Moelora and Lumina-MGPT are focused on specific aspects such as contrastive learning guided mixture of experts for parameter-efficient fine-tuning and photorealistic text-to-image generation, respectively.


In [None]:
# Inspect the raw response object: the answer text plus the source nodes retrieved for it.
response.__dict__

{'response': 'The model, specifically referred to in the context as various versions and types such as HealthGPT, Med-Flamingo, LLaV A-Med, and others, is designed for multimodal learning tasks. These tasks include visual comprehension, where the model processes and understands visual data, and generation tasks, where the model generates images or text based on given inputs. The models are also compared in terms of their performance in comprehension tasks, indicating their ability to understand and process complex information from different modalities, such as vision and language. Additionally, some models like Moelora and Lumina-MGPT are focused on specific aspects such as contrastive learning guided mixture of experts for parameter-efficient fine-tuning and photorealistic text-to-image generation, respectively.',
 'source_nodes': [NodeWithScore(node=TextNode(id_='934d50af-1ceb-457b-8563-07dbef58b550', embedding=None, metadata={'page_label': '10', 'file_name': '2502.09838v2.pdf', 'fil

## Store

In [None]:
# Write the index's storage context to ./vectors so it can be reloaded later.
index.storage_context.persist(persist_dir="./vectors")

In [None]:
from llama_index.core import StorageContext, load_index_from_storage

# rebuild the storage context from the directory written by persist()
storage_context = StorageContext.from_defaults(persist_dir="./vectors")

# load the index, reusing hf_embeddings (the same model the index was built with)
index_from_storage = load_index_from_storage(storage_context, embed_model=hf_embeddings)

In [None]:
# Query engine backed by the index reloaded from ./vectors.
qa = index_from_storage.as_query_engine(llm=llm_querying)

In [None]:
# Ask the same question as before to check that the reloaded index behaves like the original one.
response = qa.query("what does this model do?")
print(response)

The model, specifically referred to in the context as various versions and types such as HealthGPT, Med-Flamingo, LLaV A-Med, and others, is designed for multimodal learning tasks. These tasks include visual comprehension, where the model processes and understands visual data, and generation tasks, where the model generates images or text based on given inputs. The models are also compared in terms of their performance in comprehension tasks, indicating their ability to understand and process complex information from different modalities, such as vision and language. Additionally, some models like Moelora and Lumina-MGPT are focused on specific aspects such as contrastive learning guided mixture of experts for parameter-efficient fine-tuning and photorealistic text-to-image generation, respectively.


# Using Vector Stores

In [None]:
# Install Chroma and its LlamaIndex vector-store integration.
# NOTE(review): versions are not pinned, so results may change between runs.
%pip install -Uq chromadb
%pip install -Uq llama-index-vector-stores-chroma

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00

In [None]:
import chromadb
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# initialize client, setting path to save data
db = chromadb.PersistentClient(path="./chroma_db")

# create the collection, or reuse it if it already exists on disk
chroma_collection = db.get_or_create_collection("healthGPT")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection is persisted on disk, so building the index from `nodes` on
# every run would insert the same chunks again and pollute retrieval with
# duplicates. Embed and insert only when the collection is empty; otherwise
# reuse the vectors that are already stored.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        nodes, storage_context=storage_context, embed_model=hf_embeddings
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store, embed_model=hf_embeddings
    )

# You can also load from documents and apply transformations in place
# index = VectorStoreIndex.from_documents(
#     docs, storage_context=storage_context, transformations=[]
# )

# Or you can initialize your index from your vector store and then add the nodes
# index = VectorStoreIndex.from_vector_store(
#     vector_store=vector_store, embed_model=hf_embeddings
# )
# index.insert_nodes(nodes)


# create a query engine and query
query_engine = index.as_query_engine(llm=llm_querying)

In [None]:
# Query the Chroma-backed index through the engine created in the previous cell.
response = query_engine.query("What is this model good at?")
print(response)

This model, specifically HealthGPT-L14, excels across all sub-tasks, achieving optimal or near-optimal results with an average score of 74.4, significantly surpassing other models. It is particularly good at medical visual question answering and image reconstruction tasks, demonstrating stable reconstruction performance even with a small amount of data. Additionally, it performs well in comprehension tasks, often being selected as the best answer by clinicians in human evaluation.
