<a href="https://colab.research.google.com/github/trdykhuong/gemini_api/blob/main/examples/Browser_as_a_tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up the SDK

In [2]:
%pip install -U -q 'google-genai'

from google import genai
genai.__version__

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m237.3/237.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

'1.44.0'

Set up your API key

In [3]:
import os
from google.colab import userdata

# Fetch the API key from Colab's `userdata` store, under the name 'GOOGLE_API_KEY'.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

Create the SDK client

In [4]:
from google.genai import types

# One client is shared by every call in this notebook.
client = genai.Client(api_key=GOOGLE_API_KEY)

# LIVE_MODEL is passed to `client.aio.live.connect` (streaming sessions);
# MODEL is passed to `client.chats.create` (regular chat conversations).
LIVE_MODEL = 'gemini-2.0-flash-live-001'  # @param ['gemini-2.0-flash-live-001', 'gemini-live-2.5-flash-preview', 'gemini-2.5-flash-preview-native-audio-dialog', 'gemini-2.5-flash-exp-native-audio-thinking-dialog'] {allow-input: true, isTemplate: true}
MODEL = 'gemini-2.5-flash'  # @param ['gemini-2.5-flash'] {allow-input: true, isTemplate: true}

Define some helpers

In [5]:
import json
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

from IPython.display import display, HTML, Markdown


def show_parts(r: types.GenerateContentResponse) -> None:
  """Render a GenerateContentResponse object in IPython.

  Text parts are displayed as Markdown, executable-code parts as a fenced
  Python block, and any other part is dumped as indented JSON. If the first
  candidate has grounding metadata with a search entry point, its rendered
  content is displayed as HTML after the parts.

  Args:
    r: The response to render. Only the first candidate is shown.
  """
  if not r.candidates:
    # No candidate at all (for example the request itself was rejected), so
    # there is nothing to index into below.
    print(f'No candidates returned: prompt_feedback={r.prompt_feedback}')
    return

  candidate = r.candidates[0]
  content = candidate.content
  # `content` may be missing when generation stopped before producing output;
  # treat that the same way as missing parts.
  parts = content.parts if content is not None else None
  if parts is None:
    finish_reason = candidate.finish_reason
    print(f'{finish_reason=}')
    return

  for part in parts:
    if part.text:
      display(Markdown(part.text))
    elif part.executable_code:
      display(Markdown(f'```python\n{part.executable_code.code}\n```'))
    else:
      print(json.dumps(part.model_dump(exclude_none=True), indent=2))

  grounding_metadata = candidate.grounding_metadata
  if grounding_metadata and grounding_metadata.search_entry_point:
    display(HTML(grounding_metadata.search_entry_point.rendered_content))


def can_crawl_url(url: str, user_agent: str = "*") -> bool:
    """Consult the site's robots.txt to decide whether a URL may be crawled.

    The robots.txt location is derived from the scheme and host of `url`.

    Args:
        url: The full URL to check.
        user_agent: The user agent to check, defaults to any UA.

    Returns:
        True if robots.txt permits fetching the URL. False if it does not, or
        if anything goes wrong while retrieving or evaluating robots.txt.
    """
    try:
        target = urlparse(url)
        parser = RobotFileParser()
        parser.set_url(f"{target.scheme}://{target.netloc}/robots.txt")
        parser.read()
        allowed = parser.can_fetch(user_agent, url)
    except Exception as e:
        print(f"Error checking robots.txt: {e}")
        # Be a good citizen: fail closed.
        return False

    return allowed

# Browsing live


Use Google Search as a tool


In [6]:
# Live session configuration: text-only responses, with `google_search`
# listed as the only tool.
config = {
    'response_modalities': ['TEXT'],
    'tools': [
        {'google_search': {}},
    ],
}


async def stream_response(stream, *, tool=None):
  """Consume a live streamed response, printing text and answering tool calls.

  Args:
    stream: A connected live session, as yielded by
      `client.aio.live.connect(...)`.
    tool: Optional callable used to serve function calls. Its `__name__` must
      match the function name requested by the model. When omitted, every
      function call is answered with the placeholder result 'ok'.

  Returns:
    A list of every message received from the stream, in arrival order.
  """
  all_responses = []

  async for msg in stream.receive():
    all_responses.append(msg)

    if text := msg.text:
      # Print streamed text responses.
      print(text, end='')

    elif tool_call := msg.tool_call:
      # Handle tool calls: build one response per requested function call.
      function_responses = []
      for fc in tool_call.function_calls or []:
        print('< Tool call', fc.model_dump(exclude_none=True))

        if tool:
          # Call the tool.
          assert fc.name == tool.__name__, "Unknown tool call encountered"
          tool_result = tool(**fc.args)

        else:
          # Return 'ok' as a way to mock tool calls.
          tool_result = 'ok'

        function_responses.append(types.FunctionResponse(
            name=fc.name,
            id=fc.id,
            response={'result': tool_result},
        ))

      if function_responses:
        # `stream.send(input=...)` is deprecated; `send_tool_response` is the
        # dedicated method, and it accepts all responses for a tool call at once.
        await stream.send_tool_response(function_responses=function_responses)

  return all_responses

In [7]:
async def run():
  async with client.aio.live.connect(model=LIVE_MODEL, config=config) as stream:

    await stream.send(input="Bài báo nổi bật trên Báo Tuổi Trẻ hôm nay là gì?", end_of_turn=True)
    await stream_response(stream)

await run()

Please use one of the more specific methods: `send_client_content`, `send_realtime_input`, or `send_tool_response` instead.
  await stream.send(input="Bài báo nổi bật trên Báo Tuổi Trẻ hôm nay là gì?", end_of_turn=True)


Dựa trên kết quả tìm kiếm từ Báo Tuổi Trẻ hôm nay, ngày 15 tháng 10 năm 2025, một số tin nổi bật bao gồm:

*   **Đường sắt tốc độ cao nối Cần Giờ của Vinspeed:** TP.HCM lập hội đồng thẩm định dự án.
*   **Giá vàng thế giới:** Vượt 4.200 USD/ounce, vàng miếng SJC lên 148 triệu đồng/lượng.
*   **Campuchia công bố danh tính** nghi phạm người Trung Quốc tra tấn, sát hại nam sinh Hàn Quốc.
*   **Hàn Quốc cấm công dân** đến 3 "điểm đen" ở Campuchia.
*   **Hà Nội mở đợt thi đua cao điểm** chào mừng Đại hội lần thứ XIV của Đảng.
*   **TP Hồ Chí Minh bắt tay hành động** vì mục tiêu chung.



Add a live browser

In [8]:
%pip install -q markdownify

In [9]:
import requests

import markdownify


def load_page(url: str) -> str:
  """Load the page contents as Markdown.

  Args:
    url: The full absolute URL of the page to load.

  Returns:
    The page converted to Markdown, or a message describing why the page
    could not be loaded (robots.txt refusal, network or HTTP error).
  """

  if not can_crawl_url(url):
    return f"URL {url} failed a robots.txt check."

  try:
    # Always pass a timeout: without one, `requests` can wait indefinitely on
    # an unresponsive server and stall the whole conversation.
    page = requests.get(url, timeout=30)
    # Report HTTP failures (404, 500, ...) as errors rather than converting
    # the server's error page as if it were the requested content.
    page.raise_for_status()
    return markdownify.markdownify(page.content)

  except Exception as e:
    # Errors are returned as text so the caller (the model) can react to them.
    return f"Error accessing URL: {e}"

In [10]:
# Build a function declaration from `load_page` and dump it to a plain dict
# (dropping unset fields) so it can be placed in the dict-based config below.
load_page_def = types.Tool(functionDeclarations=[
    types.FunctionDeclaration.from_callable(client=client, callable=load_page)]).model_dump(exclude_none=True)

# Live session configuration: text-only responses, `load_page` as the only
# tool, and a system instruction telling the model to name the address, call
# the tool, then answer.
config = {
    'response_modalities': ['TEXT'],
    'tools': [
        load_page_def,
    ],
    'system_instruction': """Your job is to answer the users query using the tools available.

First determine the address that will have the information and tell the user. Then immediately
invoke the tool. Then answer the user.
"""
}


async def run():
  async with client.aio.live.connect(model=LIVE_MODEL, config=config) as stream:

    await stream.send(input="What is today's featured article on the English Wikipedia?", end_of_turn=True)
    await stream_response(stream, tool=load_page)

await run()

Please use one of the more specific methods: `send_client_content`, `send_realtime_input`, or `send_tool_response` instead.
  await stream.send(input="What is today's featured article on the English Wikipedia?", end_of_turn=True)


I can find the current featured article on the English Wikipedia by accessing its main page. The URL for that is "https://en.wikipedia.org/wiki/Main_Page".
< Tool call {'id': 'function-call-16543207123890117898', 'args': {'url': 'https://en.wikipedia.org/wiki/Main_Page'}, 'name': 'load_page'}


Please use one of the more specific methods: `send_client_content`, `send_realtime_input`, or `send_tool_response` instead.
  await stream.send(input=tool_response)


I am sorry, I was unable to retrieve the content from the Wikipedia main page due to a robots.txt check failure. I cannot directly answer your question.


Browse pages visually


In [None]:
!apt install -y chromium-browser

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor libfuse3-3 libudev1 snapd squashfs-tools systemd-hwe-hwdb udev
Suggested packages:
  apparmor-profiles-extra apparmor-utils fuse3 zenity | kdialog
The following NEW packages will be installed:
  apparmor chromium-browser libfuse3-3 snapd squashfs-tools systemd-hwe-hwdb
  udev
The following packages will be upgraded:
  libudev1
1 upgraded, 7 newly installed, 0 to remove and 37 not upgraded.
Need to get 34.1 MB of archives.
After this operation, 134 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 apparmor amd64 3.0.4-2ubuntu2.4 [598 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 squashfs-tools amd64 1:4.5-3build1 [159 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libudev1 amd64 249.11-0ubuntu3.17 [76.7 kB]
Get:4 http://archive.ubuntu.com/

In [None]:
%pip install -q selenium webdriver-manager

Define a graphical browser

In [None]:
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

SCREENSHOT_FILE = 'screenshot.png'


def browse_url(url: str) -> str:
    """Captures a screenshot of the webpage at the provided URL.

    A graphical browser will be used to connect to the URL provided,
    and generate a screenshot of the rendered web page. The screenshot is
    written to SCREENSHOT_FILE.

    Args:
        url: The full absolute URL to browse/screenshot.

    Returns:
        The loaded page's source converted to Markdown if the screenshot was
        captured, or an error message otherwise.
    """
    if not can_crawl_url(url):
        return f"URL {url} failed a robots.txt check."

    # Bind before the `try` so the `finally` clause is safe even when the
    # browser fails to start; otherwise an UnboundLocalError would replace
    # the error message returned from the `except` branch.
    driver = None
    try:
        chrome_options = webdriver.ChromeOptions()
        # The `--headless` argument is sufficient; the separate
        # `options.headless` attribute is no longer part of Selenium's API.
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        driver = webdriver.Chrome(options=chrome_options)

        # Take one large image, 2x high as it is wide. This should be enough to
        # capture most of a page's interesting info, and should capture anything
        # designed "above the fold", without going too deep into things like
        # footer links, infinitely scrolling pages, etc.
        # Otherwise multiple images are needed, which requires waiting, scrolling
        # and stitching, and introduces lag that slows down interactions.
        driver.set_window_size(1024, 2048)
        driver.get(url)

        # Give the page a fixed amount of time to finish loading.
        time.sleep(5)
        driver.save_screenshot(SCREENSHOT_FILE)

        print(f"Screenshot saved to {SCREENSHOT_FILE}")
        return markdownify.markdownify(driver.page_source)

    except Exception as e:
        print(f"An error occurred: {e}")
        return str(e)

    finally:
        # Close the browser, if one was started.
        if driver is not None:
            driver.quit()


# Try the browser on a sample page. The trailing `;` keeps the notebook from
# echoing the returned string as the cell's output.
url = "https://en.wikipedia.org/wiki/Castle"
browse_url(url);

In [None]:
from IPython.display import Image

# Display the most recent capture. Use the shared constant rather than a
# hard-coded filename so this stays in sync with `browse_url`.
Image(SCREENSHOT_FILE)

Connect the browser to the model

In [None]:
# System instruction for the visual-browsing chat. The tool name given to the
# model must match the declared function name (`browse_url`) exactly.
sys_int = """You are a system with access to websites via the `browse_url` tool.
Use the `browse_url` tool to browse a URL and generate a screenshot that will be
returned for you to see and inspect, like using a web browser.

When a user requests information, first use your knowledge to determine a specific
page URL, tell the user the URL and then invoke the `browse_url` with this URL. The
tool will supply the website, at which point you will examine the contents of the
screenshot to answer the user's questions. Do not ask the user to proceed, just act.

You will not be able to inspect the page HTML, so determine the most specific page
URL, rather than starting navigation from a site's homepage.
"""

# Because `browse_url` generates an image, and images can't be used in function calling
# (but can be used in regular Content/Parts), automatic function calling can't be used and
# the tool must be specified explicitly, and handled manually.
browse_tool = types.Tool(function_declarations=[
    types.FunctionDeclaration.from_callable(client=client, callable=browse_url)])

chat = client.chats.create(
    model=MODEL,
    config={'tools': [browse_tool], 'system_instruction': sys_int})

# The reply is expected to contain a `browse_url` function call, which is
# served manually afterwards.
r = chat.send_message('Search the key word "trending" on Youtube?')
show_parts(r)

In [None]:
# `import PIL` alone does not load the `PIL.Image` submodule; import it
# explicitly. (`from PIL import Image` is avoided because `Image` already
# refers to `IPython.display.Image` in this notebook.)
import PIL.Image

response_parts = []

# For each function call, generate the response in two parts. Once for the
# function response, and one for the image as regular content. This simulates
# the function "returning" an image to the model as part of a function call.
for p in r.candidates[0].content.parts:
  if fn := p.function_call:
    assert fn.name == 'browse_url'

    url = fn.args['url']
    print(url)
    response = browse_url(url)
    print(response)

    # `PIL.Image.open` is lazy. Load the pixels now so this image is not
    # affected when a later `browse_url` call overwrites SCREENSHOT_FILE.
    img = PIL.Image.open(SCREENSHOT_FILE)
    img.load()

    fr = types.Part(function_response=types.FunctionResponse(
        name=fn.name,
        id=fn.id,
        response={'result': response},
    ))
    response_parts.extend([fr, img])

In [None]:
# Show the screenshot currently stored in SCREENSHOT_FILE.
Image(SCREENSHOT_FILE)

In [None]:
# Send the collected function responses and screenshots back to the chat,
# then render the model's reply.
r2 = chat.send_message(response_parts)
show_parts(r2)

# Browse local services

In [None]:
!wget -nv https://storage.googleapis.com/generativeai-downloads/data/intranet.zip
!unzip intranet.zip

In [None]:
import http.server
import os
import socketserver
import threading


PORT = 80
DIRECTORY = "./intranet/"


class Handler(http.server.SimpleHTTPRequestHandler):
    """Request handler that serves static files from DIRECTORY."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, directory=DIRECTORY, **kwargs)


# `HTTPServer` is a `TCPServer` subclass that enables address reuse, so
# re-running this cell after a shutdown does not fail with
# "Address already in use".
httpd = http.server.HTTPServer(("", PORT), Handler)
# Serve in a background daemon thread so the notebook stays interactive and a
# still-running server cannot keep the interpreter alive at exit.
server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
server_thread.start()

In [None]:
!echo "127.0.0.1 papercorp" >> /etc/hosts
!curl http://papercorp:{PORT}/

In [None]:
# `import PIL` alone does not load the `PIL.Image` submodule; import it
# explicitly so `PIL.Image.open` is guaranteed to resolve.
import PIL.Image

# Browse the locally served site, print what `browse_url` returns, and show
# the resulting screenshot as the cell output.
print(browse_url(f"http://papercorp:{PORT}/"))
PIL.Image.open(SCREENSHOT_FILE)

In [None]:
# Chat configuration for the intranet example. The `load_page` callable is
# passed directly as a tool (unlike `browse_url`, which is declared explicitly
# and handled by hand).
# NOTE(review): the system instruction hard-codes http://papercorp/ with no
# port, so it presumably relies on the local server listening on port 80.
config = {
    'system_instruction': """Use the tools you have to answer the user's questions about
the "PaperCorp" company.

You have access to web pages through the `load_page` tool, including access to the local
network and intranet, where you will find information about the company. The `load_page`
tool will return you the page contents as Markdown.

The intranet hostname is `papercorp`, and the home page can be accessed via
http://papercorp/.

Unless you know an address already, start navigating from the home page to find other
pages.
""",
    'tools': [load_page],
}

# Start a fresh chat with this configuration and render the answer.
chat = client.chats.create(model=MODEL, config=config)
r = chat.send_message('What forms are available through HR?')
show_parts(r)

In [None]:
# Stop the `serve_forever` loop running in the background thread, then close
# the server's listening socket.
httpd.shutdown()
httpd.server_close()