In [1]:
!pip install -q llama-index==0.14.0 openai==1.107.0 llama-index-tools-google==0.6.0 \
                newspaper4k==0.9.3.1 lxml-html-clean==0.4.2 jedi==0.19.2

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.6/296.6 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00

In [2]:
import os

# Credentials used later in the notebook:
#   OPENAI_API_KEY        - exported to the process environment
#   GOOGLE_SEARCH_KEY     - Google search API key
#   GOOGLE_SEARCH_ENGINE  - Search Engine ID
# On Google Colab they are read from the notebook's secret store. Anywhere else,
# set them as environment variables before running this cell.
try:
    from google.colab import userdata
except ImportError:  # not running on Colab
    userdata = None


def _read_secret(name: str) -> str:
    """Return the secret called ``name``.

    Reads from Colab's ``userdata`` store when it is importable, otherwise from
    ``os.environ``.

    Raises:
        RuntimeError: if the value is missing or empty.
    """
    value = userdata.get(name) if userdata is not None else os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Missing credential {name!r}: add it to the Colab secrets or set it "
            "as an environment variable."
        )
    return value


os.environ["OPENAI_API_KEY"] = _read_secret("OPENAI_API_KEY")
GOOGLE_SEARCH_KEY = _read_secret("GOOGLE_SEARCH_KEY")
GOOGLE_SEARCH_ENGINE = _read_secret("GOOGLE_SEARCH_ENGINE")

## LLM and Embedding Model

In [3]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

# Register the embedding model and the LLM on the global Settings object.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(
    model="gpt-5-mini",
    additional_kwargs={"reasoning_effort": "minimal"},
)

# Using Agents/Tools


In [4]:
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.workflow import Context

# Sample tool handed to the agent below. The docstring is intentionally left
# unchanged: the agent framework presumably exposes it to the LLM as the tool
# description — TODO confirm before rewording it.
def multiply(a: int, b: int) -> int:
    """Multiply two integers and returns the result integer"""
    product = a * b
    return product

# Build a ReAct agent whose only tool is `multiply`; verbose mode logs each workflow step.
agent = ReActAgent(
    tools=[multiply],
    verbose=True,
)

# The context object stores the conversation history / session state for this agent.
ctx = Context(agent)

In [5]:
from llama_index.core.agent.workflow import AgentStream, ToolCallResult

handler = agent.run("What is the multiplication of 43 and 45?", ctx=ctx)

async for ev in handler.stream_events():
    # if isinstance(ev, ToolCallResult):
    #     print(f"\nCall {ev.tool_name} with {ev.tool_kwargs}\nReturned: {ev.tool_output}")
    if isinstance(ev, AgentStream):
        print(f"{ev.delta}", end="", flush=True)

response = await handler

Running step init_run
Step init_run produced event AgentInput
Running step setup_agent
Step setup_agent produced event AgentSetup
Running step run_agent_step
Thought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: multiply
Action Input: {"a": 43, "b": 45}Step run_agent_step produced event AgentOutput
Running step parse_agent_output
Step parse_agent_output produced no event
Running step call_tool
Step call_tool produced event ToolCallResult
Running step aggregate_tool_results
Step aggregate_tool_results produced event AgentInput
Running step setup_agent
Step setup_agent produced event AgentSetup
Running step run_agent_step
Thought: I can answer without using any more tools. I'll use the user's language to answer
Answer: 43 × 45 = 1,935Step run_agent_step produced event AgentOutput
Running step parse_agent_output
Step parse_agent_output produced event StopEvent


In [6]:
# print() applies str() to its argument, so this shows the agent's final answer text.
print(response)

43 × 45 = 1,935


## Define Google Search Tool


In [7]:
from llama_index.tools.google import GoogleSearchToolSpec

# Google search tool spec, authenticated with the API key and search engine ID loaded earlier.
tool_spec = GoogleSearchToolSpec(
    key=GOOGLE_SEARCH_KEY,
    engine=GOOGLE_SEARCH_ENGINE,
)

In [8]:
# Import and initialize our tool spec
from llama_index.core.tools.tool_spec.load_and_search import LoadAndSearchToolSpec

# Take the first tool exposed by the Google spec and wrap it with
# LoadAndSearchToolSpec, which builds an index on top of the returned search
# results. The wrapper's tool list is what the agent receives.
google_search_tool = tool_spec.to_tool_list()[0]
load_and_search_spec = LoadAndSearchToolSpec.from_defaults(google_search_tool)
wrapped_search_tool = load_and_search_spec.to_tool_list()

## Create the Agent


In [19]:
from llama_index.core.agent.workflow import FunctionAgent

# Instructions that push the model to call the search tools whenever the
# question concerns information newer than its training data.
system_prompt = (
    "You are a helpful assistant that can search the web for current information.\n"
    "When you don't have information about recent events or models released after your knowledge cutoff,\n"
    "use the available search tools to find accurate, up-to-date information."
)

# Function-calling agent wired to the wrapped Google search tools.
search_agent = FunctionAgent(
    llm=Settings.llm,
    system_prompt=system_prompt,
    tools=wrapped_search_tool,
    verbose=True,
)

# Fresh context for the new agent (replaces the one created for the ReAct agent).
ctx = Context(search_agent)

handler = search_agent.run("How many parameters LLaMA 4 model has? List the models with parameters", ctx=ctx)

async for ev in handler.stream_events():
    if isinstance(ev, ToolCallResult):
        print(f"\nCall {ev.tool_name} with {ev.tool_kwargs}\nReturned: {ev.tool_output}")
    if isinstance(ev, AgentStream):
        print(f"{ev.delta}", end="", flush=True)

response = await handler

Running step init_run
Step init_run produced event AgentInput
Running step setup_agent
Step setup_agent produced event AgentSetup
Running step run_agent_step
Step run_agent_step produced event AgentOutput
Running step parse_agent_output
Step parse_agent_output produced no event
Running step call_tool
Step call_tool produced event ToolCallResult

Call google_search with {'query': "LLaMA 4 model parameter counts list models 'LLaMA 4' sizes"}
Returned: Content loaded! You can now search the information using read_google_search
Running step aggregate_tool_results
Step aggregate_tool_results produced event AgentInput
Running step setup_agent
Step setup_agent produced event AgentSetup
Running step run_agent_step
Step run_agent_step produced event AgentOutput
Running step parse_agent_output
Step parse_agent_output produced no event
Running step call_tool
Step call_tool produced event ToolCallResult

Call read_google_search with {'query': "LLaMA 4 models and parameter counts list 'LLaMA 4' 'pa

In [20]:
# Show the agent's final answer after a blank line.
print("\nFinal Response: " + str(response))


Final Response: LLaMA 4 is released in two sizes:

- Llama 4 Scout — 109 billion parameters  
- Llama 4 Maverick — 400 billion parameters


In [21]:
# Dump the raw record of every tool call made during the run (name, arguments, output).
tool_calls = response.tool_calls
print(f"Tool Calls Made: {tool_calls}")

Tool Calls Made: [ToolCallResult(tool_name='google_search', tool_kwargs={'query': "LLaMA 4 model parameter counts list models 'LLaMA 4' sizes"}, tool_id='call_EpCwVMc5sQw1lYkLVqXNnD92', tool_output=ToolOutput(blocks=[TextBlock(block_type='text', text='Content loaded! You can now search the information using read_google_search')], tool_name='google_search', raw_input={'args': (), 'kwargs': {'query': "LLaMA 4 model parameter counts list models 'LLaMA 4' sizes"}}, raw_output='Content loaded! You can now search the information using read_google_search', is_error=False), return_direct=False), ToolCallResult(tool_name='read_google_search', tool_kwargs={'query': "LLaMA 4 models and parameter counts list 'LLaMA 4' 'parameter' 'models' 'sizes'"}, tool_id='call_XzdsoQJdOtO9kqzKJudv8nFS', tool_output=ToolOutput(blocks=[TextBlock(block_type='text', text='LLaMA 4 comes in two sizes: Llama 4 Scout (109B parameters) and Llama 4 Maverick (400B parameters).')], tool_name='read_google_search', raw_input

# Using Tools w/ VectorStoreIndex


A limitation of the agent/tool setup above is that it **relies solely on the page description from the retrieved pages** to answer questions, so it will miss answers that do not appear in a page's description tag. A possible workaround is to fetch the result pages, extract their content with the newspaper4k library (installed at the top of this notebook), and then build an index over the downloaded content. In addition, the previous method stacks all items retrieved from the search engine into a single document, making it **difficult to pinpoint the exact source** of the response. The following method keeps each page as a separate document, which lets us present the sources easily.


## Define Google Search Tool


In [12]:
from llama_index.tools.google import GoogleSearchToolSpec

# Re-create the Google search tool spec so this section does not depend on the
# agent cells above having been run.
tool_spec = GoogleSearchToolSpec(
    key=GOOGLE_SEARCH_KEY,
    engine=GOOGLE_SEARCH_ENGINE,
)

In [13]:
# Call the search tool directly (no agent involved) and report how many results came back.
search_query = "LLaMA 4 model details"
search_results = tool_spec.google_search(search_query)

result_count = len(search_results)
print(f"Found {result_count} results")

Found 10 results


In [14]:
# Inspect the raw results: the recorded output shows a list of dicts with
# 'title', 'link' and 'snippet' keys.
print(str(search_results))

[{'title': 'The Llama 4 herd: The beginning of a new era of natively ...', 'link': 'https://ai.meta.com/blog/llama-4-multimodal-intelligence/', 'snippet': 'Apr 5, 2025 ... Llama 4 models are designed with native multimodality, incorporating early fusion to seamlessly integrate text and vision tokens into a unified\xa0...'}, {'title': 'Unmatched Performance and Efficiency | Llama 4', 'link': 'https://www.llama.com/models/llama-4/', 'snippet': 'Latest models · Llama 4 Scout. Class-leading natively multimodal model that offers superior text and visual intelligence, single H100 GPU efficiency, and a 10M\xa0...'}, {'title': 'Model tree for meta-llama/Llama-4-Scout-17B-16E', 'link': 'https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E', 'snippet': 'Apr 5, 2025 ... "Llama 4" means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights\xa0...'}, {'title': 'Llama: Industry Leading, Open-Source AI', 'link': 'https://ww

In [15]:
import newspaper

# Download every search hit and keep the extracted article text together with
# its title and URL. Fetching is best-effort: a page that cannot be downloaded
# or parsed, or that yields no text, is reported and skipped, so that the
# final count can be reconciled with the number of search results.
pages_content = []

for item in search_results:
    url = item.get("link")
    title = item.get("title", "")

    if not url:
        print(f"Skipping result without a link: {title!r}")
        continue

    try:
        article = newspaper.Article(url)
        article.download()
        article.parse()
    except Exception as e:  # deliberately broad: one bad page must not stop the loop
        print(f"Failed to fetch {url}: {e}")
        continue

    if not article.text:
        # Previously these pages were dropped without any message.
        print(f"No text extracted from {url}")
        continue

    pages_content.append({
        "url": url,
        "title": title,
        "text": article.text
    })

print(f"Fetched content from {len(pages_content)} pages")



Failed to fetch https://machine-learning-made-simple.medium.com/inside-llama-4-how-metas-new-open-source-ai-crushes-gpt-4o-and-gemini-e3265f914599: Article `download()` failed with Status code 429 for url None on URL https://machine-learning-made-simple.medium.com/inside-llama-4-how-metas-new-open-source-ai-crushes-gpt-4o-and-gemini-e3265f914599
Fetched content from 6 pages


In [16]:
# Build the index
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import VectorStoreIndex

# One Document per fetched page; title and URL are stored as metadata so they
# can be read back from the source nodes of a response.
documents = []
for page in pages_content:
    page_metadata = {"title": page["title"], "url": page["url"]}
    documents.append(Document(text=page["text"], metadata=page_metadata))

# Chunk the documents with a sentence splitter (chunk_size=512, chunk_overlap=128)
# while building the vector index.
splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)
index = VectorStoreIndex.from_documents(documents, transformations=[splitter])

In [17]:
# Ask the indexed pages a question through a default query engine.
question = "How many parameters does LLaMA 4 have? List exact sizes of each variant."
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

Llama 4 variants and their parameter sizes (activated / total where provided):

- Llama 4 Scout: 17 billion activated parameters (109 billion total parameters with 16 experts)  
- Llama 4 Maverick: 17 billion activated parameters (400 billion total parameters with 128 experts)

Additionally, earlier summary: Llama 4 series (2025) uses a mixture-of-experts architecture and the models are described as 17B parameter models with multiple experts.


In [18]:
# List each source node of the response with its page title, URL and score.
separator = "-" * 40
for source in response.source_nodes:
    meta = source.metadata
    print(f"Title:  {meta['title']}")
    print(f"Source: {meta['url']}")
    print(f"Score:  {source.score:.4f}")
    print(separator)

Title:  Llama (language model) - Wikipedia
Source: https://en.wikipedia.org/wiki/Llama_(language_model)
Score:  0.5311
----------------------------------------
Title:  Model tree for meta-llama/Llama-4-Scout-17B-16E
Source: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E
Score:  0.5287
----------------------------------------
