# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.57 openai==1.59.8 llama-index-finetuning==0.1.12 llama-index-embeddings-huggingface==0.2.3 llama-index-embeddings-cohere==0.1.9 llama-index-readers-web==0.1.23 cohere==5.6.2 tiktoken==0.7.0 chromadb==0.5.5 sentence-transformers==2.7.0 pydantic==2.10.0 llama-index-vector-stores-chroma==0.1.10 kaleido==0.2.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.8/167.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m

In [2]:
import os

# The OpenAI API key must be present in the environment; it will be used later.
#   - On Google Colab: read it from the "openai_api_key" secret.
#   - Elsewhere: keep an already-exported OPENAI_API_KEY, otherwise prompt for it
#     (getpass keeps the key out of the notebook source and its saved output).
try:
    from google.colab import userdata
except ImportError:  # not running on Colab
    userdata = None

if userdata is not None:
    os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
elif not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
# Jupyter already runs an asyncio event loop. nest_asyncio.apply() patches asyncio
# so that code which starts its own event loop can still run inside the notebook.
# NOTE(review): presumably required by the llama-index calls below — confirm which ones.

import nest_asyncio

nest_asyncio.apply()

# Load Models


In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Register the default LLM on the global Settings object; it is passed explicitly
# to the first chat engine created below.
Settings.llm = OpenAI(temperature=1, model="gpt-4o-mini")
# Default embedding model.
# NOTE(review): presumably this must match the model used to build the downloaded
# vector store — confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Load Indexes


In [5]:
# Download the prebuilt vector store archive (vectorstore.zip) from the
# "jaiganesan/ai_tutor_knowledge" dataset repo on the Hugging Face Hub into the
# current directory (local_dir=".").
from huggingface_hub import hf_hub_download

# `vectorstore` holds the call's return value (presumably the local path of the
# downloaded file — confirm). It is not the `vector_store` object created later.
vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip", repo_type="dataset", local_dir=".")

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [6]:
# Extract the downloaded archive into the working directory. -o overwrites existing
# files without prompting, so the cell can be re-run.
!unzip -o vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Load the vector store from the local storage.
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")

# get_or_create_collection silently creates an empty collection when the extracted
# store is missing or the path is wrong. Fail here with a clear message instead of
# getting empty retrievals from the chat engines later on.
if chroma_collection.count() == 0:
    raise RuntimeError(
        "Chroma collection 'ai_tutor_knowledge' is empty - check that "
        "vectorstore.zip was downloaded and extracted into ./ai_tutor_knowledge"
    )

# Wrap the Chroma collection for llama-index and build an index over the stored vectors.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_index = VectorStoreIndex.from_vector_store(vector_store)

# Display result


In [8]:
# A simple function to show the response and the sources.
def display_res(response):
    """Print a response's answer text followed by the source nodes returned with it.

    Args:
        response: Object exposing ``response`` (the answer text, possibly
            ``None``) and ``source_nodes`` (an iterable of objects with
            ``node_id``, ``text`` and ``score`` attributes; empty or ``None``
            when no sources were used).

    The answer is flattened onto a single line. Line breaks are replaced by one
    space rather than deleted, so sentences and list items that were on separate
    lines are not glued together (e.g. "enhancement.PEFT", "strategies:1.").
    """
    answer = response.response or ""  # tolerate a missing answer instead of crashing
    # Drop blank lines and join the remaining ones with a single space.
    flattened = " ".join(
        line.strip() for line in answer.splitlines() if line.strip()
    )
    print("Response:\n\t", flattened)

    print("Sources:")
    if response.source_nodes:
        for src in response.source_nodes:
            print("\tNode ID\t", src.node_id)
            print("\tText\t", src.text)
            print("\tScore\t", src.score)
            print("\t" + "-_" * 20)  # visual separator between sources
    else:
        print("\tNo sources used!")

# Chat Engine


In [9]:
# Build a chat engine on top of the vector index, using the globally configured LLM.
# No chat_mode is passed, so the library's default mode applies.
chat_engine = vector_index.as_chat_engine(llm=Settings.llm)

In [10]:
# First question: the prompt explicitly asks the engine to use its tool, so the
# answer is expected to come from the indexed documents.
response = chat_engine.chat("Use the tool to answer, how does parameter efficient finetuning work?")

display_res(response)

Response:
	 Parameter Efficient Fine Tuning (PEFT) optimizes the fine-tuning process for large language models (LLMs) by making slight adjustments to a model's weights, significantly reducing computational costs compared to full fine-tuning, where every weight is adjusted. The approach involves starting with a pretrained model that already possesses extensive knowledge, then applying task-specific datasets for enhancement.PEFT employs three main strategies:1. **Selective**: Fine-tunes only a subset of the model's parameters, reducing the computational burden.2. **Reparameterization**: Utilizes low-rank representations to adjust model weights. An example is LoRA (Low Rank Adaptation), where weight matrices are decomposed into smaller, low-rank matrices, allowing for a significant reduction in the number of parameters that need to be trained while maintaining performance.3. **Additive**: Represents another method within PEFT for fine-tuning.Overall, PEFT aims to streamline the fine-tunin

In [11]:
# Second question: unrelated to the indexed content. In the recorded run the
# engine answered without returning any source nodes.
response = chat_engine.chat("Could you tell me a joke?")
display_res(response)

Response:
	 Sure! Here's a joke for you:Why did the scarecrow win an award?Because he was outstanding in his field!
Sources:
	No sources used!


In [12]:
# Third question: checks whether the engine can recall earlier turns of this conversation.
response = chat_engine.chat("What was the first question I asked?")
display_res(response)

Response:
	 The first question you asked was, "How does parameter efficient finetuning work?"
Sources:
	No sources used!


In [13]:
# Reset the session to clear the chat memory, so the next message starts a fresh conversation.
chat_engine.reset()

In [14]:
# Fourth question: same prompt as before, asked after the reset. The engine should
# no longer recall the earlier interactions.
response = chat_engine.chat("What was the first question I asked?")
display_res(response)

Response:
	 The first question you asked was, "What was the first question I asked?"
Sources:
	No sources used!


# Streaming


In [15]:
# Stream the words as soon as they are available instead of waiting for the model to finish generation.
# stream_chat is called on the same chat_engine as above.
streaming_response = chat_engine.stream_chat(
    "Write a paragraph explaining how RAG and PEFT work, and highlight the differences between them."
)
# Print the streamed answer to the cell output.
streaming_response.print_response_stream()

Retrieval-Augmented Generation (RAG) and Parameter-Efficient Fine-Tuning (PEFT) are two techniques used in the realm of machine learning, particularly in natural language processing. RAG combines two primary components: retrieval and generation. It first retrieves relevant information from external sources through an indexing and searching process, leveraging either sparse or dense retrieval methods. After obtaining the necessary context, the generation component produces coherent and contextually relevant responses, integrating the retrieved information without requiring extensive fine-tuning. In contrast, PEFT allows for fine-tuning large pretrained models by only adjusting a small subset of additional parameters, rather than the entire model. This parameter-efficient approach drastically reduces computational and storage costs while still delivering performance comparable to fully fine-tuned models. The main difference lies in their approach: RAG focuses on enhancing generative capa

## Condense Question


The condense-question mode rewrites the user's message into a refined, standalone query by looking at the previous chat history together with the current question. The refined query is then used to fetch the relevant nodes.


In [16]:
# Define the GPT-4o model that will be used by the chat_engine to improve the query.
# NOTE(review): temperature=0.9 looks high for a query-rewriting step — confirm this is intended.
gpt4 = OpenAI(temperature=0.9, model="gpt-4o")

In [17]:
# Rebind chat_engine to a new engine in "condense_question" mode, using the gpt4
# LLM. verbose=True makes the engine print intermediate output, such as the
# "Querying with: ..." line in the recorded run.
chat_engine = vector_index.as_chat_engine(
    chat_mode="condense_question", llm=gpt4, verbose=True
)

In [18]:
# First message sent to the new engine. In the recorded run the "Querying with:"
# line shows the question unchanged.
response = chat_engine.chat(
    "How does Retrieval-Augmented Generation (RAG) work, and which problem does it solve?"
)
display_res(response)

Querying with: How does Retrieval-Augmented Generation (RAG) work, and which problem does it solve?
Response:
	 Retrieval-Augmented Generation (RAG) works by combining pretraining and retrieval-based models to enhance the performance of generative models. It involves several key processing steps, including query classification, retrieval of relevant documents, reranking these documents based on their relevance, repacking them into a structured format, and summarizing the key information for response generation. This approach solves the problem of generative large language models producing outdated information or fabricating facts by integrating up-to-date and relevant content from external knowledge sources, thereby improving the accuracy and reliability of the generated information.
Sources:
	Node ID	 2aa05360-f43a-4819-bce7-0acf7b897eab
	Text	 Generative large language models are prone to producing outdated information or fabricating facts, although they were aligned with human prefe

## ReAct


ReAct is an agent-based chat mode that runs a loop in which the model decides, at each step of the interaction, whether to query the data engine. This offers flexibility, but the quality of the responses relies on the quality of the Large Language Model, so it requires careful management to avoid inaccurate answers.


In [19]:
# Rebind chat_engine to a new engine in "react" mode. No llm is passed here
# (presumably it falls back to Settings.llm — confirm). verbose=True prints the
# agent's intermediate steps.
chat_engine = vector_index.as_chat_engine(chat_mode="react", verbose=True)

In [20]:
# Ask a factual question. The verbose trace of the recorded run shows the engine
# calling query_engine_tool with the question.
response = chat_engine.chat(
    "Which company developed Claude 3.5 Sonnet, and what is its primary application?"
)

Added user message to memory: Which company developed Claude 3.5 Sonnet, and what is its primary application?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input":"Which company developed Claude 3.5 Sonnet, and what is its primary application?"}
Got output: Claude 3.5 Sonnet was developed by Anthropic. Its primary application is to serve as a free-tier model that balances cost and features, making it suitable for tasks like creative writing and answering questions.



In [21]:
# Show the final answer together with the source nodes returned with it.
display_res(response)

Response:
	 Claude 3.5 Sonnet was developed by Anthropic. Its primary application is to serve as a free-tier model that balances cost and features, making it suitable for tasks like creative writing and answering questions.
Sources:
	Node ID	 4558b7d3-7f77-4f55-b0e7-64d385820117
	Text	 2. Dedicated to safety and security   It is a well-known fact that Anthropic prioritizes responsible AI development the most  and it is clearly seen in Claudes design. This generative AI model is trained on a carefully curated dataset thus it minimizes biases and factual errors to a large extent. On top of that  Claude also undergoes rigorous safety checks to prevent the generation of harmful and misleading content.   3. Emphasizes Explainability   While many of the AI and LLMs currently operate as black boxes  Claude offers a high level of explainability surpassing other models. This means it can explain the reasoning and decision-making process behind all of its responses. Therefore  it helps users to 