# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.14.0 openai==1.107.0 llama-index-finetuning==0.4.1 llama-index-embeddings-huggingface==0.6.1 \
                llama-index-embeddings-cohere==0.6.1 cohere==5.18.0 llama-index-readers-web==0.5.3 chromadb==1.0.21 jedi==0.19.2 \
                llama-index-vector-stores-chroma==0.5.3 llama-index-llms-google-genai==0.5.0 google-genai==1.38.0 llama-index-llms-openai==0.5.6

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25

In [2]:
import os

# Set the following API keys in the Python environment. They are used later.
# os.environ["OPENAI_API_KEY"] = "<YOUR-OPENAI-API-KEY>"
# os.environ["GOOGLE_API_KEY"] = "<YOUR-GOOGLE-API-KEY>"

try:
    # On Google Colab the keys are read from the notebook's "Secrets" panel.
    from google.colab import userdata
except ImportError:
    # Not running on Colab: keep whatever is already exported in the
    # environment (or set via the commented lines above) instead of crashing.
    userdata = None

if userdata is not None:
    os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
    # NOTE(review): this secret name is mixed-case, unlike the env var it
    # populates; confirm it matches the name stored in Colab Secrets.
    os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
# Needed before any cell below that drives async code from this kernel
# (e.g. the sub-question engine created with use_async=True).
import nest_asyncio

nest_asyncio.apply()

# Load a Model


In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Assign the chat model and the embedding model on the global `Settings` object.
Settings.llm = OpenAI(
    model="gpt-5-mini",
    additional_kwargs=dict(reasoning_effort='minimal'),
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

# Load Indexes


In [5]:
from huggingface_hub import hf_hub_download

# Download the zipped vector store from the Hub dataset repo into the
# current working directory; `vectorstore` receives the value returned
# by hf_hub_download.
vectorstore = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    repo_type="dataset",
    filename="vectorstore.zip",
    local_dir=".",
)

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [6]:
# Extract the downloaded archive in place; -o overwrites existing files without
# prompting, so the cell can be re-run safely.
!unzip -o vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Open the on-disk Chroma database extracted from vectorstore.zip.
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")

# get_or_create_collection silently creates an EMPTY collection when the path
# or collection name is wrong (e.g. the download/unzip step did not run).
# Fail fast here rather than getting empty retrievals from every query below.
assert chroma_collection.count() > 0, (
    "Chroma collection 'ai_tutor_knowledge' is empty - "
    "check that vectorstore.zip was downloaded and extracted to ./ai_tutor_knowledge"
)

# Wrap the existing collection as a LlamaIndex vector index.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_index = VectorStoreIndex.from_vector_store(vector_store)

# Multi-Step Query Engine


## GPT-5-mini


In [8]:
from llama_index.core.indices.query.query_transform.base import StepDecomposeQueryTransform

# Step-decomposition query transform driven by the LLM configured on `Settings`;
# verbose=True is passed so the transform logs its work.
step_decompose_transform_gpt5 = StepDecomposeQueryTransform(
    llm=Settings.llm,
    verbose=True,
)

In [9]:
from llama_index.core.query_engine.multistep_query_engine import MultiStepQueryEngine

# Default query engine over the vector index; used to answer each sub-query.
query_engine_gpt5_mini = vector_index.as_query_engine()

# Multi-step query engine: wraps the default engine with the step-decomposition
# transform. `index_summary` is prompt text describing what the index covers;
# its wording (typo "quesion" fixed) now matches the tool description used for
# the sub-question engine later in the notebook.
multi_step_query_engine = MultiStepQueryEngine(
    query_engine=query_engine_gpt5_mini,
    query_transform=step_decompose_transform_gpt5,
    index_summary="Used to answer the Questions about RAG, Machine Learning, Deep Learning, and Generative AI. Note: Don't repeat the Same question",
)

# Query Dataset

## Default

In [10]:
# Default query engine
# Baseline: a single query against the vector index with no query transformation.
query_engine = vector_index.as_query_engine()
res = query_engine.query("Write about Llama 3.1 Model, BERT and PEFT methods")
# Print only the answer text of the response object.
print(res.response)

I can only answer using the provided excerpts. The available information covers LLaMA (not Llama 3.1 specifically), PEFT methods (including LoRA and Llama-Adapter), and mentions tokenizer/model classes. Based on that information:

- LLaMA model family:
  - Configuration and classes: A configuration class is provided (LlamaConfig). Tokenization is supported via LlamaTokenizer and LlamaTokenizerFast, which include utilities like build_inputs_with_special_tokens, get_special_tokens_mask, create_token_type_ids_from_sequences, and save_vocabulary; the fast tokenizer also supports update_post_processor. Model classes include LlamaModel (forward), and task-specific variants such as LlamaForCausalLM, LlamaForSequenceClassification, LlamaForQuestionAnswering, and LlamaForTokenClassification (each exposing forward). Flax implementations include FlaxLlamaModel and FlaxLlamaForCausalLM (with __call__).

- PEFT methods for LLaMA:
  - LoRA (Low-Rank Adaptation): The documentation references fine-tun

In [11]:
# Dump id, title, text and score of every source node attached to `res`,
# with a separator line after each node.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(20 * "-_")

Node ID	 781b7b12-eca2-47c0-a66e-9d6be670e951
Title	 LLaMA
Text	 on how to fine-tune LLaMA model using LoRA method via the 🤗 PEFT library with intuitive UI. 🌎 - A [notebook](https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-open-llama.ipynb) on how to deploy Open-LLaMA model for text generation on Amazon SageMaker. 🌎 ## LlamaConfig[[autodoc]] LlamaConfig## LlamaTokenizer[[autodoc]] LlamaTokenizer    - build_inputs_with_special_tokens    - get_special_tokens_mask    - create_token_type_ids_from_sequences    - save_vocabulary## LlamaTokenizerFast[[autodoc]] LlamaTokenizerFast    - build_inputs_with_special_tokens    - get_special_tokens_mask    - create_token_type_ids_from_sequences    - update_post_processor    - save_vocabulary## LlamaModel[[autodoc]] LlamaModel    - forward## LlamaForCausalLM[[autodoc]] LlamaForCausalLM    - forward## LlamaForSequenceClassification[[autodoc]] LlamaForSequenceClassif

## GPT-5-mini Multi-Step


In [12]:
# Run the question through the multi-step engine and print the final answer text.
response = multi_step_query_engine.query("Write about Llama 3.1 Model, BERT and PEFT methods")
print(response.response)

[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT methods
[0m[1;3;38;5;200m> New query: What are the key features, differences, and typical use cases of Llama 3.1, BERT, and PEFT methods?
[0m[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT methods
[0m[1;3;38;5;200m> New query: Which specific classes, methods, and example notebooks are provided for fine-tuning LLaMA 3.1 with PEFT (especially LoRA), and how do they demonstrate training and deployment (e.g., SageMaker, DeepSpeed, FSDP) for tasks like causal language modeling, sequence classification, and question answering?
[0m[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT methods
[0m[1;3;38;5;200m> New query: Which specific PEFT/LoRA classes, methods, and example notebooks are available for fine-tuning and deploying Llama 3.1 (LLaMA) for tasks like causal language modeling, sequence classification, and question answering, and how do those examples demonstrate training a

In [13]:
# Print each intermediate (sub-query, answer) pair stored under
# `response.metadata['sub_qa']`.
# The loop variables deliberately do NOT reuse the name `response`: rebinding it
# here would make this cell fail when re-run and would make later cells read
# the last sub-answer instead of the final multi-step response.
for sub_query, sub_response in response.metadata['sub_qa']:
    print(f"**{sub_query}**\n{sub_response}\n")

**What are the key features, differences, and typical use cases of Llama 3.1, BERT, and PEFT methods?**
Key features, differences, and typical use cases based only on the provided information:

Llama 3.1
- Key features:
  - Provided as a family of LLaMA model classes and utilities (configuration and tokenizers).
  - Relevant classes include LlamaConfig, LlamaTokenizer / LlamaTokenizerFast (with methods for special tokens, token type ids, post-processing, and saving vocabulary), and model classes for various tasks (LlamaModel, LlamaForCausalLM, LlamaForSequenceClassification, LlamaForQuestionAnswering, LlamaForTokenClassification).
  - Flax implementations are available (FlaxLlamaModel, FlaxLlamaForCausalLM).
  - Typical forward/serve APIs are exposed (forward or __call__).
- Typical use cases:
  - Causal language modeling / text generation (LlamaForCausalLM).
  - Sequence classification, question answering, and token classification via dedicated model classes.
  - Deployments for text 

In [14]:
# Dump id, text and score of every source node attached to `response`,
# with a separator line after each node.
for src in response.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(20 * "-_")

Node ID	 781b7b12-eca2-47c0-a66e-9d6be670e951
Text	 on how to fine-tune LLaMA model using LoRA method via the 🤗 PEFT library with intuitive UI. 🌎 - A [notebook](https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/text-generation-open-llama.ipynb) on how to deploy Open-LLaMA model for text generation on Amazon SageMaker. 🌎 ## LlamaConfig[[autodoc]] LlamaConfig## LlamaTokenizer[[autodoc]] LlamaTokenizer    - build_inputs_with_special_tokens    - get_special_tokens_mask    - create_token_type_ids_from_sequences    - save_vocabulary## LlamaTokenizerFast[[autodoc]] LlamaTokenizerFast    - build_inputs_with_special_tokens    - get_special_tokens_mask    - create_token_type_ids_from_sequences    - update_post_processor    - save_vocabulary## LlamaModel[[autodoc]] LlamaModel    - forward## LlamaForCausalLM[[autodoc]] LlamaForCausalLM    - forward## LlamaForSequenceClassification[[autodoc]] LlamaForSequenceClassification    - 

# Test gemini-2.5-flash Multi-Step


In [15]:
from llama_index.core.indices.query.query_transform.base import StepDecomposeQueryTransform
from llama_index.core.query_engine.multistep_query_engine import MultiStepQueryEngine
from llama_index.llms.google_genai import GoogleGenAI

# Gemini model used both to decompose the query and to answer each sub-query.
llm_gemini = GoogleGenAI(model="gemini-2.5-flash")

step_decompose_transform = StepDecomposeQueryTransform(llm=llm_gemini, verbose=True)

# Plain (single-step) engine over the index. It has its own name so that
# `query_engine_gemini` is bound only once, to the multi-step wrapper below,
# rather than being rebound from one kind of engine to another.
base_query_engine_gemini = vector_index.as_query_engine(llm=llm_gemini)

# Multi-step engine queried by the following cells.
query_engine_gemini = MultiStepQueryEngine(
    query_engine=base_query_engine_gemini,
    query_transform=step_decompose_transform,
    index_summary="Used to answer the Questions about RAG, Machine Learning, Deep Learning,LLMs and Generative AI",
)

In [16]:
# Run the question through the Gemini-backed multi-step engine.
response_gemini = query_engine_gemini.query("Write about Llama 3.1 Model, BERT and PEFT")

[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT
[0m[1;3;38;5;200m> New query: What is Llama 3.1 Model?
[0m[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT
[0m[1;3;38;5;200m> New query: What is BERT?
[0m[1;3;33m> Current query: Write about Llama 3.1 Model, BERT and PEFT
[0m[1;3;38;5;200m> New query: What is PEFT?
[0m

In [17]:
# Display the answer text of the Gemini multi-step response (shown as the cell's output value).
response_gemini.response

'Llama 3.1 Model\n- Llama 3.1 405B is Meta’s largest model, trained on over 15 trillion tokens using more than 16,000 H100 GPUs — the first Llama model trained at that scale.\n- It supports a 128K context length, improving its ability to handle very long inputs.\n- Capabilities: enhanced reasoning and coding compared with earlier Llama versions; strong programming ability with high-quality code generation and solid understanding of syntax and logic; excels in logical reasoning, problem solving, analysis, and inference.\n- Multilingual: roughly 50% of its pretraining tokens are multilingual, enabling effective processing and understanding of multiple languages.\n- RAG & tool use: supports zero-shot tool use and can be used to build agentic behaviors with retrieval-augmented generation (RAG). In benchmarks it outperformed some contemporaries (e.g., GPT-4o and Claude 3.5 Sonnet) on mathematical reasoning, complex reasoning, multilingual support, and long-text processing (scoring 95.2 poin

## Test Retriever on Multistep


In [18]:
# import llama_index
# from llama_index.core.indices.query.schema import QueryBundle

# t = QueryBundle("How Retrieval Augmented Generation (RAG) work?")
# query_engine_gemini.retrieve(t)

## Subquestion Query Engine

In [19]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.llms.openai import OpenAI

# LLM handed to the question generator that produces the sub-questions.
llm = OpenAI(
    model="gpt-5-mini",
    additional_kwargs=dict(reasoning_effort='minimal'),
)
question_gen = LLMQuestionGenerator.from_defaults(llm=llm)

# Single tool: the default query engine over the vector index, exposed with a
# name and a description.
query_engine = vector_index.as_query_engine()
query_engine_tools = []
query_engine_tools.append(
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="LlamaIndex",
            description="Used to answer the Questions about RAG, Machine Learning, Deep Learning, and Generative AI. Note: Don't repeat the Same question",
        ),
    )
)

# Assemble the sub-question engine (created with use_async=True) and run the query.
sub_question_engine = SubQuestionQueryEngine.from_defaults(
    question_gen=question_gen,
    query_engine_tools=query_engine_tools,
    use_async=True,
)
response = sub_question_engine.query("Write about Llama 3.1 Model, BERT and PEFT")


Generated 8 sub questions.
[1;3;38;2;237;90;200m[LlamaIndex] Q: What are the key features, architecture, and capabilities of Llama 3.1?
[0m[1;3;38;2;90;149;237m[LlamaIndex] Q: What are common use cases and performance characteristics of Llama 3.1 compared to other large language models?
[0m[1;3;38;2;11;159;203m[LlamaIndex] Q: What is BERT's architecture, training objective, and main contributions to NLP?
[0m[1;3;38;2;155;135;227m[LlamaIndex] Q: What are typical downstream use cases and performance strengths/limitations of BERT?
[0m[1;3;38;2;237;90;200m[LlamaIndex] Q: What is PEFT (Parameter-Efficient Fine-Tuning), its main methods (e.g., LoRA, adapters, prompt tuning), and how it works?
[0m[1;3;38;2;90;149;237m[LlamaIndex] Q: How is PEFT applied to large models like Llama 3.1 and BERT in practice, including trade-offs and example workflows?
[0m[1;3;38;2;11;159;203m[LlamaIndex] Q: What are recommended best practices for choosing between full fine-tuning and PEFT for differe

In [20]:
# Display the answer text of the `response` produced by the sub-question engine cell above.
response.response

'Llama 3.1, BERT, and PEFT — concise overview and how they relate\n\nLlama 3.1 — key features, architecture, and capabilities\n- Model family and release\n  - A transformer-based large language model family that scales across multiple sizes (notably 405B, 70B, and 8B parameters).\n  - Open-source release enabling vendor independence, on‑prem or preferred-cloud hosting, and community-driven customization and fine‑tuning.\n- Architecture and scale\n  - Transformer encoder-decoder lineage derived from the Llama family but used as a causal/LM-style model family.\n  - The largest model (405B) was trained at extreme scale (trillions of tokens) using very large GPU fleets (examples up to ~16,000 H100 GPUs with 80GB HBM3), NVLink server topologies for the highest-scale runs, and huge distributed storage/throughput requirements. Smaller variants (70B, 8B) are much more accessible and use high-speed fabrics (InfiniBand / NVidia Quantum2) for distributed training/serving.\n- Context and capabilit

# HyDE Transform


In [21]:
# Fresh default query engine over the vector index; wrapped by the HyDE transform in the next cell.
query_engine = vector_index.as_query_engine()

In [22]:
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine.transform_query_engine import TransformQueryEngine

# HyDE transform created with include_original=True, then wrapped around the
# default query engine so each query passes through the transform first.
hyde = HyDEQueryTransform(include_original=True)
hyde_query_engine = TransformQueryEngine(
    query_engine=query_engine,
    query_transform=hyde,
)

In [23]:
# Run the question through the HyDE-wrapped query engine.
response = hyde_query_engine.query("Write about Llama 3.1 Model, BERT and PEFT")

In [24]:
# Display the answer text of the HyDE query response (shown as the cell's output value).
response.response

'I can only answer using the provided information. The excerpts you gave discuss Llama-Adapter (a PEFT method for LLaMA), LoRA usage for LLaMA via the 🤗 PEFT library, and related tooling. There is no information in the provided excerpts about "Llama 3.1" specifically or about BERT. Below is a summary of what the provided excerpts do cover and how PEFT is presented there.\n\nLLaMA-related PEFT (from the excerpts)\n- LLaMA-Adapter:\n  - Purpose: A parameter-efficient fine-tuning method to turn a frozen LLaMA model into an instruction-following model.\n  - Mechanism: Learns a set of adaptation prompts that are prefixed to input tokens and injected via a zero-initialized attention mechanism with zero gating so the frozen model’s pre-trained knowledge is preserved while adding instructional cues.\n  - Efficiency: Adds only 1.2M learnable parameters to a frozen LLaMA 7B model and fine-tunes in under one hour on 8 A100 GPUs (using 52K self-instruct examples).\n  - Performance and extensions: 

In [25]:
# Dump id, text and score of every source node attached to `response`,
# with a separator line after each node.
for src in response.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(20 * "-_")

Node ID	 e1e3e842-7160-40c4-8e74-772fb8254f5e
Text	 # Llama-Adapter[Llama-Adapter](https://hf.co/papers/2303.16199) is a PEFT method specifically designed for turning Llama into an instruction-following model. The Llama model is frozen and only a set of adaptation prompts prefixed to the input instruction tokens are learned. Since randomly initialized modules inserted into the model can cause the model to lose some of its existing knowledge, Llama-Adapter uses zero-initialized attention with zero gating to progressively add the instructional prompts to the model.The abstract from the paper is:*We present LLaMA-Adapter, a lightweight adaption method to efficiently fine-tune LLaMA into an instruction-following model. Using 52K self-instruct demonstrations, LLaMA-Adapter only introduces 1.2M learnable parameters upon the frozen LLaMA 7B model, and costs less than one hour for fine-tuning on 8 A100 GPUs. Specifically, we adopt a set of learnable adaption prompts, and prepend them to the in

In [26]:
# Call the HyDE transform directly on the query string (bypassing the query engine)
# to inspect what it produces.
query_bundle = hyde("Write about Llama 3.1 Model, BERT and PEFT")

In [27]:
# First embedding string of the transformed query.
# NOTE(review): presumably the generated hypothetical document, with the original
# query following it because include_original=True - confirm against the HyDE docs.
hyde_doc = query_bundle.embedding_strs[0]

In [28]:
# Display the extracted embedding string (shown as the cell's output value).
hyde_doc

'Llama 3.1, BERT, and PEFT each occupy important places in modern natural language processing; together they illustrate how foundational architectures, transfer learning, and parameter-efficient tuning interact to build capable, practical language models.\n\nLlama 3.1\nLlama 3.1 is a recent generation in Meta’s Llama family of large language models (LLMs). Like other Llama variants, it is a transformer-based autoregressive model that predicts the next token given prior context. Llama 3.1 advances prior releases in several dimensions: scale, training data curation, architectural refinements, and instruction-following ability. Key features and considerations include:\n- Model sizes and scaling: Llama 3.1 is available in multiple parameter scales (ranging from billions to hundreds of billions of parameters), enabling trade-offs between latency, memory footprint, and performance. Larger sizes generally yield better few-shot and zero-shot capabilities.\n- Training data and pretraining: It i