## Install Packages and Setup Variables


In [1]:
!pip install -q openai==1.107.0 llama-index==0.14.0 llama-index-vector-stores-qdrant==0.8.4 chromadb==1.0.21 \
                llama-index-vector-stores-chroma==0.5.3 jedi==0.19.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00

In [2]:
import os

# Set the following API Keys in the Python environment. Will be used later.
# Outside Colab, uncomment the next line and fill in the key instead of using
# the `userdata` lookup below.
# os.environ["OPENAI_API_KEY"] = "[OPENAI_API_KEY]"

# On Colab: read the key stored under the name OPENAI_API_KEY via
# `google.colab.userdata` and export it as an environment variable.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [3]:
import nest_asyncio

# Patch asyncio so that an event loop can be re-entered while one is already
# running (the notebook kernel runs its own loop).
# NOTE(review): presumably needed by the async calls made later by the
# ingestion pipeline / query engine - confirm.
nest_asyncio.apply()

# Load a Model


In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register the default LLM on the global llama-index Settings object.
# `reasoning_effort` is forwarded to the OpenAI API through additional_kwargs.
Settings.llm = OpenAI(model="gpt-5-mini", additional_kwargs={'reasoning_effort':'minimal'})
# Register the default embedding model. The ingestion pipeline below builds its
# own OpenAIEmbedding with the same model name; keep the two in sync.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

## Create a Vector Store


In [5]:
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

# In-memory alternative (nothing is persisted):
# qdrant_client = QdrantClient(location=":memory:")
# or Persist storage: local on-disk Qdrant rooted at /content/
qdrant_client = QdrantClient(path="/content/")

# Vector store backed by the "ai_tutor_knowledge" collection of that client.
vector_store = QdrantVectorStore(client=qdrant_client, collection_name="ai_tutor_knowledge")

# Load the Dataset (JSON)


## Download


In [6]:
from huggingface_hub import hf_hub_download
# Download the JSONL dataset file from the Hugging Face Hub into /content;
# `file_path` is the value returned by hf_hub_download and is opened below.
file_path = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    repo_type="dataset",
    filename="ai_tutor_knowledge.jsonl",
    local_dir="/content",
)

ai_tutor_knowledge.jsonl: 0.00B [00:00, ?B/s]

## Read File


In [7]:
import json

# The file is JSON Lines: one JSON object per line.
# - Decode explicitly as UTF-8 so the result does not depend on the
#   platform's default locale encoding.
# - Skip blank lines (e.g. a trailing newline at the end of the file), which
#   would otherwise make json.loads raise JSONDecodeError.
with open(file_path, "r", encoding="utf-8") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file if line.strip()]

# Peek at the text of one record.
ai_tutor_knowledge[1]['content']

"Github Repo: https://github.com/vaibhawkhemka/ML-Umbrella/tree/main/NLP/Product-Categorization   From e-commerce to Customer support  all businesses require some kind of NER model to process huge amounts of texts from users.   To automate this whole  one requires NER models to extract relevant and important entities from text.   Final Result/OutputInput text = EL D68 (Green  32 GB) 3 GB RAM [3 GB RAM U+007C 32 GB ROM U+007C Expandable Upto 128 GB  15.46 cm (6.088 inch) Display  13MP Rear Camera U+007C 8MP Front Camera  4000 mAh Battery  Quad-Core Processor]   Output =   Green ->>>> COLOR 32 GB ->>>> STORAGE 3 GB RAM ->>>> RAM 3 GB RAM ->>>> RAM 32 GB ROM ->>>> STORAGE Expandable Upto 128 GB ->>>> EXPANDABLE_STORAGE 15.46 cm (6.088 inch) ->>>> SCREEN_SIZE 13MP Rear Camera ->>>> BACK_CAMERA 8MP Front Camera ->>>> FRONT_CAMERA 4000 mAh Battery ->>>> BATTERY_CAPACITY Quad-Core Processor ->>>> PROCESSOR_CORE   Data PreparationA tool for creating this dataset (https://github.com/tecoholic/n

In [8]:
# Not necessary to use full dataset:
# keep the first 100 records plus every record from index 500 onward
# (records 100-499 are left out). These are still raw dicts at this point.
documents = ai_tutor_knowledge[:100]+ai_tutor_knowledge[500:]

# Transforming


In [9]:
from typing import List
from llama_index.core import Document

def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    """Wrap each raw dataset record in a llama-index ``Document``.

    Every record must provide the keys ``doc_id``, ``content``, ``url``,
    ``name``, ``tokens`` and ``source`` (a missing key raises ``KeyError``).
    ``content`` becomes the document text; the other fields are stored as
    metadata, with ``name`` stored under the ``title`` key.

    Args:
        data_list: Records to convert.

    Returns:
        One ``Document`` per record, in input order.
    """
    # Metadata keys passed as excluded_llm_metadata_keys /
    # excluded_embed_metadata_keys on every Document.
    llm_excluded = ("title", "tokens", "source")
    embed_excluded = ("url", "tokens", "source")

    return [
        Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={  # type: ignore
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "source": record["source"],
            },
            # Fresh list per Document so no two documents share one list.
            excluded_llm_metadata_keys=list(llm_excluded),
            excluded_embed_metadata_keys=list(embed_excluded),
        )
        for record in data_list
    ]

# Convert the selected records into Document objects, then display one of
# them as a sanity check.
doc = create_docs_from_list(documents)
doc[2]

Document(id_='45501b72-9391-529e-8e5e-59a2604ba26e', embedding=None, metadata={'url': 'https://towardsai.net/p/machine-learning/adaboost-explained-from-its-original-paper', 'title': 'AdaBoost Explained From Its Original Paper', 'tokens': 1697, 'source': 'tai_blog'}, excluded_embed_metadata_keys=['url', 'tokens', 'source'], excluded_llm_metadata_keys=['title', 'tokens', 'source'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="This publication is meant to show a very popular ML algorithm in complete detail  how it works  the math behind it  how to execute it in Python and an explanation of the proofs of the original paper. There will be math and code  but it is written in a way that allows you to decide which are the fun parts.   A bit on the origins of the algorithm: It was proposed by Yoav Freund and Robert E. Schapire in a 1997 paper  A Decision-Theoretic Generalization of On-Line Learning a

In [10]:
from llama_index.core.node_parser import TokenTextSplitter

# Token-based splitter: segments of 512 tokens, with a 128-token overlap
# between consecutive segments, using a single space as the separator.
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)

In [11]:
from llama_index.core.extractors import (
    KeywordExtractor,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# Create the pipeline to apply the transformations on each document, and
# store the resulting nodes in the Qdrant vector store (`vector_store`).
# Steps, in order:
#   1. text_splitter    - split each document into chunks
#   2. KeywordExtractor - ask the LLM for up to 10 keywords per chunk
#                         (they appear as `excerpt_keywords` in node metadata)
#   3. OpenAIEmbedding  - embed each chunk
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        KeywordExtractor(keywords=10, llm=Settings.llm),
        OpenAIEmbedding(model="text-embedding-3-small"),
    ],
    vector_store=vector_store,
)

# Run the transformation pipeline.
# The recorded run took roughly 9 minutes for 1181 chunks.
nodes = pipeline.run(documents=doc, show_progress=True)

Parsing nodes:   0%|          | 0/362 [00:00<?, ?it/s]

100%|██████████| 1181/1181 [09:08<00:00,  2.15it/s]


Generating embeddings:   0%|          | 0/1181 [00:00<?, ?it/s]

  self._client.create_payload_index(


In [12]:
!zip ai_tutor_knowledge_metadata.zip /content/collection/ai_tutor_knowledge

  adding: content/collection/ai_tutor_knowledge/ (stored 0%)


In [13]:
# Number of nodes (chunks) returned by the ingestion pipeline run.
len(nodes)

1181

In [14]:
# Inspect one node's metadata: the fields set on the Document plus the
# `excerpt_keywords` entry added during ingestion.
nodes[0].metadata

{'url': 'https://towardsai.net/p/machine-learning/bert-huggingface-model-deployment-using-kubernetes-github-repo-03-07-2024',
 'title': 'BERT HuggingFace Model Deployment using Kubernetes [ Github Repo]  03/07/2024',
 'tokens': 768,
 'source': 'tai_blog',
 'excerpt_keywords': 'BERT, HuggingFace, Kubernetes, Docker, Model Deployment, FastAPI, Uvicorn, Minikube, Transformer, Inference'}

In [15]:
from llama_index.core import VectorStoreIndex

# Create the index based on the vector store.
# No documents are passed here: the index wraps the `vector_store` that the
# ingestion pipeline was configured to write to.
index = VectorStoreIndex.from_vector_store(vector_store)

In [16]:
# Baseline query without any metadata filter: retrieve the 10 most similar
# chunks and use them to answer the question.
query_engine = index.as_query_engine(similarity_top_k=10)
res = query_engine.query("Explain how Advance RAG works?")

# Display the generated answer text.
res.response

'Advanced RAG approaches extend basic retrieval-augmented generation by adding modules or workflows that improve how retrieved documents are selected, refined, and used by the generator. Key techniques and how they work:\n\n- Standard RAG (baseline)\n  - Retrieve a set of top-k documents and place them directly into the LM prompt as context. The generator must reason over the whole set at once. This is simple but can create long prompts, increased latency, and position-bias or redundancy problems.\n\n- Self-Reflective RAG\n  - The generator is specialized (instruction-tuned) to emit self-reflection signals or tags during generation. Those signals trigger dynamic retrieval and internal critique steps: the model can request additional evidence, reassess relevance of retrieved snippets, or filter out irrelevant items before finalizing the answer. This requires instruction tuning of the LM to produce and act on those tags.\n\n- Corrective RAG (CRAG)\n  - Introduces an external evaluator mo

# Metadata Filtering


In [26]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, FilterOperator

# Restrict retrieval to nodes whose `excerpt_keywords` metadata
# text-matches "PEFT".
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="PEFT",
        ),
    ]
)

# Query Dataset


In [27]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.
# Retrieval is restricted by the metadata `filters` defined above.
query_engine = index.as_query_engine(filters=filters)

res = query_engine.query("How Parameter efficient fine tuning (PEFT) Works?")

In [28]:
# Display the answer text generated for the filtered PEFT query.
res.response

'Parameter-efficient fine-tuning (PEFT) reduces the compute, memory, and storage cost of adapting large pretrained language models by changing only a small, targeted portion of model parameters instead of updating every weight. The general workflow and main approaches are:\n\n- Start from a pretrained LLM whose full weights remain mostly unchanged. Provide a task-specific dataset (e.g., dialogue summarization, QA, customer data) and train to adapt model behavior for the target task.\n\n- Avoid full fine-tuning because it requires updating every weight and carries large costs beyond weights (optimizer states, gradients, activations, temporary memory). PEFT keeps those costs far lower by limiting what is updated.\n\nThree principal PEFT approaches:\n1. Selective\n  - Pick a subset of the original model parameters (e.g., certain layers or modules) and fine-tune only those. All other parameters remain frozen.\n\n2. Reparameterization\n  - Replace or augment parts of the model with a low-ra

In [29]:
# Show the retrieved nodes that the answer in `res` was built from.
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label fixed: this line prints the extracted keywords, not a score.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)

Node ID	 7163ca21-6425-4911-960b-c8cccb267513
Title	 Fine-Tuning and Evaluating Large Language Models: Key Benchmarks and Metrics
Text	 SAMSUM is one of the datasets that FLAN T5 uses. There are several pre-trained FLAN T5 models that have been fine-tuned on SAMSUM  including Phil Schmid/flan-t5-base-samsum and jasonmcaffee/flan-t5-large-samsum on Hugging Face. If we want to fine-tune the FLAN T5 model specifically for formal dialogue conversations  we can do so using the DIALOGUESUM dataset.   Models fine-tuned on DialogSum can be applied to areas like customer support  meeting minutes generation  chatbot summarization  and more.   2. PEFT (Parameter efficient fine tuning)Training LLMs is computationally intensive. Full finetuning is computationally expensive as it might change each weight in the model. First  we start with a pretrained LLM like GPT-3. This model already has a vast amount of knowledge and understanding of language. Then we provide task-specific datasets  which could b

# Filter Metadata (source)


In [30]:
from llama_index.core.vector_stores import MetadataFilter,MetadataFilters,FilterOperator,FilterCondition

# Combine two conditions with AND: `excerpt_keywords` must text-match "BERT"
# and `source` must equal "tai_blog".
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="BERT",
        ),
        MetadataFilter(
            key="source", operator=FilterOperator.EQ, value="tai_blog"
        ),
    ],
    condition=FilterCondition.AND,
)

In [31]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.
# Retrieval is restricted by the combined keyword + source `filters`.
query_engine = index.as_query_engine(filters=filters)

result = query_engine.query("Explain BERT?")

In [32]:
# Display the answer text generated for the filtered BERT query.
result.response

'BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based language model that uses only the encoder component of the original transformer architecture. Its key characteristics and training approach:\n\n- Bidirectional processing: BERT simultaneously considers tokens to the left and right of a given token, allowing it to capture richer context than unidirectional models that read only left-to-right or right-to-left. This bidirectional view helps disambiguate words (e.g., deciding whether "bank" means riverbank or financial institution using surrounding words).\n\n- Architecture variants: Common sizes include BERT BASE (12 layers, ~110M parameters, 12 attention heads, hidden size 768) and BERT LARGE (24 layers, ~340M parameters, 12 attention heads, hidden size 1024).\n\n- Pre-training tasks:\n  - Masked Language Modeling (MLM): About 15% of input tokens are masked and the model is trained to predict those masked tokens, which forces learning contextualized to

In [33]:
# Show the retrieved nodes that the answer in `result` was built from.
for src in result.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label fixed: this line prints the extracted keywords, not a score.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)

Node ID	 23902446-efe1-4538-819c-ffd60fbdbe48
Title	 Attention is all you need: How Transformer Architecture in NLP started.
Text	 GPT-2  GPT-3  and GPT-4  which were decoder-only architectures. Another well-known example is BERT (Bidirectional Encoder Representations from Transformers)  an encoder-only transformer mode used as a component in sentence embedding models.   Lets talk about BERT!BERT stands for Bidirectional Encoder Representations from Transformers. It is a language model by Google that uses a transformer architecture to understand and generate human-like language. BERT is designed to simultaneously process text in both directions  allowing it to capture context more effectively than traditional unidirectional models  which read text sequentially from left to right or right to left.   Example of Bidirectional CapabilityConsider the sentence:    The bank is situated on the _______ of the river.   In a unidirectional model  understanding the blank would primarily rely on th

In [34]:
# Mismatch between the filter keyword and the query: the filter only lets
# BERT-tagged nodes from "tai_blog" through, while the question asks about
# PEFT. The LLM therefore receives only BERT-related context.

filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="excerpt_keywords",
            operator=FilterOperator.TEXT_MATCH,
            value="BERT",
        ),
        MetadataFilter(
            key="source", operator=FilterOperator.EQ, value="tai_blog"
        ),
    ],
    condition=FilterCondition.AND,
)

query_engine = index.as_query_engine(filters=filters)

result = query_engine.query("Explain PEFT?")

print(result.response)


# Show the retrieved nodes
for src in result.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    # Label fixed: this line prints the extracted keywords, not a score.
    print("Keywords\t", src.metadata["excerpt_keywords"])
    print("-_" * 20)


The provided excerpts do not mention PEFT (Parameter-Efficient Fine-Tuning) or define it. They describe BERT pretraining tasks (Masked Language Modeling and Next Sentence Prediction), tokens like CLS and SEP, cross-encoders, and the general fine-tuning workflow for tasks such as text classification (e.g., using BertForSequenceClassification and labeled examples). No information about PEFT methods, techniques, or examples appears in the excerpts.
Node ID	 397c95b6-aa19-442a-8f08-eb28c26c4a51
Title	 Attention is all you need: How Transformer Architecture in NLP started.
Text	 two tasks:   Masked Language Modeling (MLM):The inputs are sentences that start with a special token called CLS (Classify Token) and end with a SEP (separator token).   Words  tokens (consider)   Around 15% of the input tokens are masked  and the model is trained to predict those masked tokens.   The model learns to produce contextualized vectors based on the surrounding words at this stage. Read the example above a