# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.12.21 openai==1.59.8 tiktoken==0.7.0 chromadb==0.6.0 sentence-transformers==3.3.1 pydantic==2.10.5 llama-index-vector-stores-chroma==0.4.1 kaleido==0.2.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.6/455.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m602.1/602.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:

In [2]:
import os
from getpass import getpass

# The OpenAI key is read from the environment so it never has to be written
# into the notebook. If it is not already set, prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# On Google Colab the key can be loaded from the notebook secrets instead:
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.

import nest_asyncio

# Enable nested event-loop use for the rest of this notebook session.
nest_asyncio.apply()

# Load Models


In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Register the default LLM and embedding model on the global LlamaIndex Settings object.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=1)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

# Load Indexes


In [5]:
# Fetch the zipped vector store from the Hugging Face Hub dataset repo into the current directory.
from huggingface_hub import hf_hub_download

vectorstore = hf_hub_download(
    repo_type="dataset",
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="vectorstore.zip",
    local_dir=".",
)

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [6]:
# Extract the downloaded archive; -o overwrites existing files without prompting, so the cell can be re-run.
!unzip -o vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Load the pre-built vector store from local storage (./ai_tutor_knowledge).
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")

# get_collection (unlike get_or_create_collection) raises if the collection is
# missing, so a failed download/unzip is reported here instead of silently
# producing an empty index that retrieves nothing.
chroma_collection = db.get_collection("ai_tutor_knowledge")
if chroma_collection.count() == 0:
    raise RuntimeError(
        "Chroma collection 'ai_tutor_knowledge' is empty; "
        "re-download and unzip vectorstore.zip before continuing."
    )

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_index = VectorStoreIndex.from_vector_store(vector_store)



# Display result


In [8]:
# A simple function to show the response and the sources.
def display_res(response):
    """Print a chat/query response followed by the source nodes it used.

    Args:
        response: Object exposing ``response`` (the answer text, possibly
            None) and ``source_nodes`` (an iterable of nodes with
            ``node_id``, ``text`` and ``score`` attributes; may be empty
            or None).

    Returns:
        None. Everything is written to stdout.
    """
    # Collapse every run of whitespace (including newlines) into one space so
    # the answer prints on a single line without gluing adjacent sentences
    # together. A missing answer is shown as an empty string.
    answer = " ".join((response.response or "").split())
    print("Response:\n\t", answer)

    print("Sources:")
    if response.source_nodes:
        for src in response.source_nodes:
            print("\tNode ID\t", src.node_id)
            print("\tText\t", src.text)
            print("\tScore\t", src.score)
            print("\t" + "-_" * 20)
    else:
        print("\tNo sources used!")

# Chat Engine


In [23]:
# Define the chat_engine from the index; no chat_mode is passed, so the library default is used.
chat_engine = vector_index.as_chat_engine()

In [24]:
# First Question: the prompt explicitly tells the engine to use its tool to answer.
response = chat_engine.chat("Use the tool to answer, how does parameter efficient finetuning work?")

# Print the answer together with the source nodes attached to the response.
display_res(response)

Response:
	 Parameter efficient fine-tuning (PEFT) works by making minimal adjustments to the weights of a pre-trained large language model (LLM) while avoiding the computational costs associated with full fine-tuning, which alters every weight in the model. This method capitalizes on the knowledge already embedded in a pretrained model and fine-tunes it using task-specific datasets.Key approaches in PEFT include:1. **Selective Fine-Tuning**: This involves choosing a subset of the model's parameters to fine-tune rather than the entire model, which reduces computational requirements.2. **Reparameterization**: This method adjusts model weights using low-rank representations to minimize the number of parameters that need to be trained, efficiently representing weight changes without overwhelming the model.3. **Additive Method**: This approach involves adding new parameters or structures to the model that facilitate fine-tuning without extensive alterations to the existing weights.Overall,

In [25]:
# Second Question: a general request, asked in the same conversation as the first.
response = chat_engine.chat("Could you tell me a joke?")
display_res(response)

Response:
	 Sure! Here’s a joke for you:Why did the scarecrow win an award?Because he was outstanding in his field!
Sources:
	No sources used!


In [26]:
# Third Question: checks whether the engine can recall earlier turns of this conversation.
response = chat_engine.chat("What was the first question I asked?")
display_res(response)

Response:
	 The first question you asked was about how parameter efficient fine-tuning works.
Sources:
	No sources used!


In [30]:
# Reset the session to clear the conversation memory before the next question.
chat_engine.reset()

In [31]:
# Fourth Question: asked after reset(), so the engine should not recall the earlier interactions.
response = chat_engine.chat("What was the first question I asked?")
display_res(response)

Response:
	 Your first question was, "What was the first question I asked?"
Sources:
	No sources used!


# Streaming


In [15]:
# Stream the words as soon as they are available instead of waiting for the model to finish generation.
streaming_response = chat_engine.stream_chat(
    "Write a paragraph explaining how RAG and PEFT work, and highlight the differences between them."
)
# Print the response stream to the cell output.
streaming_response.print_response_stream()

Retrieval-Augmented Generation (RAG) and Parameter-Efficient Fine-Tuning (PEFT) are both methodologies in the realm of natural language processing, but they serve different purposes and employ distinct approaches. RAG combines a retrieval system with a generative model to improve output quality on knowledge-intensive tasks. It features two main components: the retrieval aspect, which extracts relevant information from external databases (like indexed documents), and the generation aspect, which produces coherent responses using the retrieved content. This dual mechanism allows RAG to leverage both parametric memory from the generative model and non-parametric memory from the retrieval system, addressing the limitations of traditional models by providing more accurate and up-to-date information. In contrast, PEFT focuses on the efficient adaptation of large pre-trained models to specific tasks by only fine-tuning a small number of additional parameters, rather than adjusting all model p

## Condense Question


The condense-question mode rewrites the user's latest message into a standalone query by combining it with the previous chat history. The refined query is then used to retrieve the relevant nodes.


In [16]:
# Define the GPT-4o model that the chat_engine will use to improve (condense) the query.
gpt4 = OpenAI(temperature=0.9, model="gpt-4o")

In [17]:
# Rebuild the chat engine in "condense_question" mode, driven by the gpt4 LLM, with verbose output enabled.
chat_engine = vector_index.as_chat_engine(verbose=True, llm=gpt4, chat_mode="condense_question")

In [18]:
# Ask a question through the condense_question engine and show the answer with its sources.
response = chat_engine.chat(
    "How does Retrieval-Augmented Generation (RAG) work, and which problem does it solve?"
)
display_res(response)

Querying with: How does Retrieval-Augmented Generation (RAG) work, and which problem does it solve?
Response:
	 Retrieval-Augmented Generation (RAG) works by integrating pretraining and retrieval-based models to enhance the performance of large language models (LLMs). It involves key steps such as query classification, retrieval of relevant documents, reranking to refine these documents' order, repacking to organize them for better generation, and summarization to extract key information. This process enables RAG to address challenges faced by LLMs, such as producing outdated information and fabricating facts, thus improving the accuracy and reliability of generated content. Additionally, RAG allows for the rapid deployment of applications without needing to update model parameters, provided relevant documents are available.
Sources:
	Node ID	 2aa05360-f43a-4819-bce7-0acf7b897eab
	Text	 Generative large language models are prone to producing outdated information or fabricating facts, a

## ReAct


ReAct is an agent-based chat mode that runs a loop in which the model decides, at each step, whether to query the data engine. This makes it flexible, but the quality of its responses depends heavily on the underlying Large Language Model, so it requires careful management to avoid inaccurate answers.


In [19]:
# Rebuild the chat engine in ReAct mode with verbose logging enabled.
chat_engine = vector_index.as_chat_engine(chat_mode="react", verbose=True)

In [20]:
# Ask a factual question; the result is kept in `response` and displayed in a later cell.
response = chat_engine.chat(
    "Which company developed Claude 3.5 Sonnet, and what is its primary application?"
)

Added user message to memory: Which company developed Claude 3.5 Sonnet, and what is its primary application?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Which company developed Claude 3.5 Sonnet, and what is its primary application?"}
Got output: The information provided does not specify which company developed Claude 3.5 Sonnet or its primary application.

=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Claude 3.5 Sonnet developer and primary application"}
Got output: The information provided does not cover details about Claude 3.5, its development, or its primary application. Please provide more specific context or related information for a focused response.

=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Claude AI model details including Claude 3.5 Sonnet, its developer and application."}
Got output: Claude AI, developed by Anthropic, is a generative AI model that emphasizes s

In [21]:
# Show the final answer and the source nodes attached to the response.
display_res(response)

Response:
	 Claude 3.5 Sonnet was developed by Anthropic. Its primary application is in generative AI tasks, particularly for creative writing and answering questions, providing users with reliable assistance in a free-tier option that balances cost and features.
Sources:
	Node ID	 55740ef4-3809-4dfa-ad06-e85bac4e165f
	Text	 seeing. Most visual perception is handled by low-level processes that merely tell your brain "that\'s a water droplet" without telling you details like where the lightest and darkest points are, or "that\'s a bush" without telling you the shape and position of every leaf. This is a feature of brains, not a bug. In everyday life it would be distracting to notice every leaf on every bush. But when you have to paint something, you have to look more closely, and when you do there\'s a lot to see. You can still be noticing new things after days of trying to paint something people usually take for granted, just as you can after days of trying to write an essay about some