# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.12.21 openai==1.59.8 cohere==5.6.2 tiktoken==0.7.0 chromadb==0.6.0 html2text==2024.2.26 sentence-transformers==3.3.1 pydantic==2.10.5 llama-index-vector-stores-chroma==0.4.1 kaleido==0.2.1 llama-index-llms-gemini==0.4.1

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.6/455.6 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os

# Set the following API Keys in the Python environment. Will be used later.
# os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_API_KEY>"
# os.environ["GOOGLE_API_KEY"] = "<YOUR_GOOGLE_API_KEY>"

from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

In [3]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
import nest_asyncio

nest_asyncio.apply()

# Load a Model


In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

Settings.llm = OpenAI(temperature=1, model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Load Indexes


In [5]:
# Downloading Vector store from Hugging face hub
from huggingface_hub import hf_hub_download

vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip", repo_type="dataset", local_dir=".")

vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [6]:
!unzip vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [7]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Create your index
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_index = VectorStoreIndex.from_vector_store(vector_store)



In [8]:
# Query Engine
ai_tutor_knowledge_query_engine = vector_index.as_query_engine(similarity_top_k=3)

res = ai_tutor_knowledge_query_engine.query("How does Retrieval Augmented Generation (RAG) work?")
print(res.response)

Retrieval Augmented Generation (RAG) operates through a structured workflow that integrates two main components: retrieval and generation. 

1. **Retrieval Component**:
   - The process begins with **query classification**, where it is determined whether retrieval is necessary for a given input query. 
   - Then, the system engages in **retrieval**, which efficiently obtains relevant documents. This involves indexing documents so they can be fetched based on user queries, utilizing techniques such as inverted indexes for sparse retrieval or dense vector encodings for dense retrieval. 
   - Following the retrieval, there is **reranking** to refine the order of the fetched documents based on their relevance to the query. 
   - The selected documents are then **repacked** into a structured format to enhance the quality of information.
   - Finally, a **summarization** step extracts key information from the repacked documents, ensuring that redundancies are eliminated for effective generat

In [9]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("Metadata\t", src.metadata)
    print("-_" * 20)

Node ID	 2aa05360-f43a-4819-bce7-0acf7b897eab
Title	 Searching for Best Practices in Retrieval-Augmented Generation:1 Introduction
Text	 Generative large language models are prone to producing outdated information or fabricating facts, although they were aligned with human preferences by reinforcement learning [1] or lightweight alternatives [2–5]. Retrieval-augmented generation (RAG) techniques address these issues by combining the strengths of pretraining and retrieval-based models, thereby providing a robust framework for enhancing model performance [6]. Furthermore, RAG enables rapid deployment of applications for specific organizations and domains without necessitating updates to the model parameters, as long as query-related documents are provided. Many RAG approaches have been proposed to enhance large language models (LLMs) through query-dependent retrievals [6–8]. A typical RAG workflow usually contains multiple intervening processing steps: query classification (determining w

# Router

Routers are modules that take in a user query and a set of “choices” (defined by metadata), and returns one or more selected choices.

They can be used for the following use cases and more:

- Selecting the right data source among a diverse range of data sources

- Deciding whether to do summarization (e.g. using summary index query engine) or semantic search (e.g. using vector index query engine)

- Deciding whether to “try” out a bunch of choices at once and combine the results (using multi-routing capabilities).


## Lets create a different query engine with Mistral AI information


In [10]:
from pathlib import Path
import requests

wiki_titles = [
    "Mistral AI",
    "Llama (language model)",
    "Claude AI",
    "OpenAI",
    "Gemini AI",
]

data_path = Path("llm_data_wiki")

if not data_path.exists():
    data_path.mkdir()

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
    ).json()

    page = next(iter(response["query"]["pages"].values()))

    if "extract" in page:
        wiki_text = page["extract"]

        with open(data_path / "llm_data_wiki.txt", "a") as fp:
            fp.write(f"Title: {title}\n{wiki_text}\n\n")
    else:
        print(f"No extract found for '{title}'")


In [11]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
)

# Assuming you have prepared a directory for llm data
documents = SimpleDirectoryReader("llm_data_wiki").load_data()

text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

transformations = [
    text_splitter,
    QuestionsAnsweredExtractor(questions=2),
    SummaryExtractor(summaries=["prev", "self"]),
    KeywordExtractor(keywords=10),
    OpenAIEmbedding(model="text-embedding-3-small"),
]

llm_index = VectorStoreIndex.from_documents(documents=documents, transformations=transformations)

llm_query_engine = llm_index.as_query_engine(similarity_top_k=2)

100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
100%|██████████| 50/50 [01:09<00:00,  1.39s/it]
100%|██████████| 50/50 [00:13<00:00,  3.79it/s]


In [12]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool

# initialize tools
ai_tutor_knowledge_tool = QueryEngineTool.from_defaults(
    query_engine=ai_tutor_knowledge_query_engine,
    description="Useful for questions about general generative AI concepts",
)
llm_tool = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine,
    description="Useful for questions about particular LLMs like Mistral, Claude, OpenAI, Gemini",
)

# initialize router query engine (single selection, pydantic)
query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[
        ai_tutor_knowledge_tool,
        llm_tool,
    ],
)

In [13]:
res = query_engine.query(
    "What is the LLama model?",
)
print(res.response)

The LLaMA model is a foundational AI model developed by Meta Platforms. It has various applications across different domains, such as healthcare, business, and military. The model has undergone improvements and fine-tuning to enhance its performance and address safety and ethical considerations. Key variations of the model include LLaMA 1 and LLaMA 2, with advancements in training methodologies, data usage, and multi-turn dialogue consistency. There have also been initiatives to create derivative models, such as Alpaca, which aim to achieve comparable capabilities to popular models like OpenAI's GPT-3. The LLaMA model serves as a base for several applications, including AI companions for business, and efforts have been made to re-implement it in various programming languages to facilitate broader usage.


In [14]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 117a217d-d132-41ef-ba05-574d58e5512f
Text	 out the loss function for tokens in the prompt (earlier parts of the dialog).


== Applications ==
The Stanford University Institute for Human-Centered Artificial Intelligence (HAI) Center for Research on Foundation Models (CRFM) released Alpaca, a training recipe based on the LLaMA 7B model that uses the "Self-Instruct" method of instruction tuning to acquire capabilities comparable to the OpenAI GPT-3 series text-davinci-003 model at a modest cost. The model files were officially removed on March 21, 2023, over hosting costs and safety concerns, though the code and paper remain online for reference.
Meditron is a family of Llama-based finetuned on a corpus of clinical guidelines, PubMed papers, and articles. It was created by researchers at École Polytechnique Fédérale de Lausanne School of Computer and Communication Sciences, and the Yale School of Medicine. It shows increased performance on medical-related benchmarks such as MedQA

In [15]:
res = query_engine.query("Explain parameter-efficient finetuning methods")
print(res.response)

Parameter-efficient fine-tuning methods aim to optimize the training of large language models while significantly reducing computational costs and resource requirements. These methods allow for adjustments to model weights without needing to change every parameter, which is particularly useful for working with extensive pre-trained models. 

There are three main approaches to parameter-efficient fine-tuning:

1. **Selective Fine-Tuning**: This approach involves choosing a subset of the model's parameters to fine-tune, allowing for targeted adjustments while keeping the majority of the model's weights unchanged.

2. **Reparameterization**: Techniques like Low Rank Adaptation (LoRA) fall into this category. LoRA approximates the original weight matrices using smaller matrices, reducing the total number of trainable parameters. For instance, a weight matrix can be decomposed into two smaller matrices, significantly lowering the parameter count while still maintaining model performance.

3

In [16]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 6be88fa3-2f8b-43e7-aba0-d874b39809fc
Text	 # FourierFT: Discrete Fourier Transformation Fine-Tuning[FourierFT](https://huggingface.co/papers/2405.03003) is a parameter-efficient fine-tuning technique that leverages Discrete Fourier Transform to compress the model's tunable weights. This method outperforms LoRA in the GLUE benchmark and common ViT classification tasks using much less parameters.FourierFT currently has the following constraints:- Only `nn.Linear` layers are supported.- Quantized layers are not supported.If these constraints don't work for your use case, consider other methods instead.The abstract from the paper is:> Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices A and B to represent the weight change, i.e., Delta W=BA. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or 

# OpenAI Agent

In [17]:
from llama_index.agent.openai import OpenAIAgent

In [18]:
system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs. Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback. Questions should be understood in this context.

Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.

Use the available tools to gather insights pertinent to the field of AI. Always use two tools at the same time. These tools accept a string (a user query rewritten as a statement) and return informative content regarding the domain of AI.
e.g:
User question: 'How can I fine-tune an LLM?'
Input to the tool: 'Fine-tuning an LLM'

User question: How can quantize an LLM?
Input to the tool: 'Quantization for LLMs'

User question: 'Teach me how to build an AI agent"'
Input to the tool: 'Building an AI Agent'

Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.

Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.

When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.

Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.

Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.

At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure to reformulate the question to the tool to capture this new angle or more profound layer of inquiry.

Do not refer to the documentation directly, but use the information provided within it to answer questions.

If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.

Make sure to format your answers in Markdown format, including code blocks and snippets.

Politely reject questions not related to AI, while being cautious not to reject unfamiliar terms or acronyms too quickly."""

In [19]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o")

agent = OpenAIAgent.from_tools(
    llm=llm,
    tools=[ai_tutor_knowledge_tool, llm_tool],
    system_prompt=system_message_openai_agent,
)

In [20]:
response = agent.chat("What is the LLama model?")
print(response.response)

The LLaMA model, developed by Meta, is a large language model designed to excel in natural language processing tasks. Initially announced on February 24, 2023, LLaMA was created to outperform larger models like GPT-3, despite having fewer parameters. The model's architecture was designed to be efficient across various hardware setups, making it accessible for a wide range of applications.

LLaMA's initial version featured 13 billion parameters and was trained on a diverse dataset of 1.4 trillion tokens sourced from public information, including CommonCrawl, GitHub, and Wikipedia. This training approach allowed LLaMA to achieve significant performance on NLP benchmarks. However, access to its model weights was initially restricted, granted only to select researchers and organizations. A notable event in its history was the leak of its weights on March 3, 2023, which led Meta to file DMCA takedown requests against unauthorized distributions.

Following the initial release, Meta introduce

In [21]:
response = agent.chat("Explain parameter-efficient finetuning methods")
print(response.response)

Parameter-efficient fine-tuning methods for large language models (LLMs) are designed to adapt models to specific tasks without the need to retrain the entire model, thereby saving computational resources and time. Here are some techniques used in this context:

1. **Supervised Fine-Tuning with Autoregressive Loss**: This method involves fine-tuning the model using a loss function that predicts the next token in a sequence, given the previous tokens. This approach helps tailor the model's behavior to specific user inputs while maintaining the context length within specified limits. It is particularly useful for tasks where the sequence prediction is crucial.

2. **Rejection Sampling**: This technique is used during the fine-tuning process to improve the quality of the generated outputs. By sampling multiple outputs and selecting the best one according to a predefined criterion, the model can be fine-tuned more effectively without altering a large number of parameters.

3. **Reinforceme

In [22]:
response = agent.chat("Write the recipe for a chocolate cake.")
print(response.response)

I'm here to assist with questions related to AI and large language models. If you have any questions about those topics, feel free to ask!


# Code related questions to GPT-4o, the remaining questions to Gemini

In [29]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.openai import OpenAI
from llama_index.llms.gemini import Gemini
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool

# initialize LLMs
gpt_4o_llm = OpenAI(model="gpt-4o")
gemini_llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)

# define query engines
llm_query_engine_code = vector_index.as_query_engine(
    llm=gpt_4o_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

llm_query_engine_rest = vector_index.as_query_engine(
    llm=gemini_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

# define tools for LLM
llm_tool_code = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine_code,
    description="Ideal for handling code-related queries, technical implementations, and troubleshooting involving Large Language Models.",
    name="LLMCodeTool",
)

llm_tool_rest = QueryEngineTool.from_defaults(
    query_engine=llm_query_engine_rest,
    description="Best suited for answering conceptual, theoretical, and general questions about Large Language Models.",
    name="LLMGeneralTool",

)


system_message_openai_agent_tools = """
You are a highly knowledgeable assistant specialized in Large Language Models. Your primary role is to assist users by providing accurate, detailed, and context-specific responses. You have access to two specialized tools:

1. **LLMCodeTool** – Use this tool when the query involves code-related tasks, technical implementations, debugging, or troubleshooting issues in code.
2. **LLMGeneralTool** – Use this tool for answering conceptual, theoretical, or general questions about Large Language Models that do not involve code specifics.

When a query is received:
- First, decide which tool best fits the user's request.
- If the question is technical or code-oriented, route the query to LLMCodeTool.
- If the question is more general or conceptual, route the query to LLMGeneralTool.
- If the query does not clearly fall into either category, provide a direct answer using your own capabilities.

Always ensure your responses are clear, concise, and directly address the user’s needs. Maintain a professional tone and provide detailed explanations where necessary.
"""

# Initialize OpenAIAgent with the system message and the router query engine
agent = OpenAIAgent.from_tools(
    llm=gpt_4o_llm,  # The base LLM, used only if no other tools apply
    tools=[llm_tool_code, llm_tool_rest],
    system_prompt=system_message_openai_agent_tools,
    # verbose = True, # Enable detailed output for debugging and better traceability
)

In [30]:
# Test the agent with a code-related question
response = agent.chat("How do I fine-tune the LLama model? Write the code for it.")
for source in response.sources:
    print(source.tool_name)

LLMCodeTool


In [31]:
# Test the agent with a Non code-related question
response1 = agent.chat("What is the relationship between Llama models and Meta")
for source in response1.sources:
    print(source.tool_name)

LLMGeneralTool
