# Exercise 5: Web Automation & Vision Agent

Goal: Build a visual web browsing agent using Helium and LangGraph.

This agent will be able to navigate the web, see screenshots, and interact with elements.


In [None]:
%pip install databricks-langchain langgraph helium
%restart_python

In [None]:
import os
import time
from typing import Annotated, TypedDict

from databricks_langchain import ChatDatabricks
from helium import click, get_driver, go_to, start_chrome, write
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.tools import tool
from langgraph.graph import END, StateGraph
from langgraph.graph.message import add_messages

In [None]:
# HEADLESS controls whether Chrome is started without a visible window.
# It is read from the HEADLESS environment variable and defaults to on; any
# value other than "true" (case-insensitive) turns it off. On Databricks keep
# it on unless a display-forwarding setup is available.
HEADLESS = os.environ.get("HEADLESS", "True").lower() == "true"


# Setup simplified Helium Navigator
class HeliumNavigator:
    """Lazily started Chrome session driven through Helium.

    Obtain the shared instance with ``get_instance()``. Chrome is started on
    the first call that needs it. The action methods (``navigate``,
    ``click_element``, ``type_text``) report failures as strings instead of
    raising, so the outcome can be handed back to the agent as tool output.
    """

    _instance = None  # singleton cache, filled by get_instance()

    def __init__(self):
        # Chrome is not started here; initialize() does it on first use.
        self._is_initialized = False

    @classmethod
    def get_instance(cls):
        """Return the shared navigator, creating it on first use."""
        if cls._instance is None:
            # cls() rather than a hard-coded class name, so calling this on a
            # subclass yields an instance of that subclass.
            cls._instance = cls()
        return cls._instance

    def initialize(self):
        """Start Chrome once; subsequent calls do nothing."""
        if not self._is_initialized:
            # Headless or windowed is decided by the module-level HEADLESS
            # flag (True means no visible browser window).
            start_chrome(headless=HEADLESS)
            self._is_initialized = True

    def get_screenshot_b64(self) -> str:
        """Capture the current page and return it as a base64 string.

        When HEADLESS is set, the same capture is also written to the
        ``screenshots/`` directory so the user can inspect what the agent
        saw. Failing to write that copy is reported but does not prevent the
        screenshot from being returned.

        Unlike the action methods, driver errors are not caught here.
        """
        import base64  # only needed for the on-disk debug copy

        self.initialize()
        driver = get_driver()

        # Capture exactly once so the saved file is the image the model gets.
        encoded = driver.get_screenshot_as_base64()

        # In HEADLESS mode, save the screenshot to disk so the user can see it
        # In a notebook, we could also display it inline, but saving to disk is safe
        if HEADLESS:
            # Nanosecond stamp: captures taken within the same second no
            # longer overwrite each other.
            filename = f"screenshots/screenshot_{time.time_ns()}.png"
            try:
                os.makedirs("screenshots", exist_ok=True)
                with open(filename, "wb") as png_file:
                    png_file.write(base64.b64decode(encoded))
            except OSError as e:
                print(f"\n[Headless Mode] Could not save screenshot to {filename}: {e}")
            else:
                print(f"\n[Headless Mode] Saved screenshot to {filename}")

        return encoded

    def navigate(self, url: str) -> str:
        """Open ``url`` and return a status string (success or error text)."""
        self.initialize()
        try:
            go_to(url)
            # Fixed pause after navigation - presumably to let the page load.
            time.sleep(2)
            return f"Navigated to {url}."
        except Exception as e:
            # Deliberately broad: the error text is returned to the caller
            # rather than raised.
            return f"Error navigating to {url}: {e}"

    def click_element(self, target: str) -> str:
        """Click ``target`` via Helium and return a status string."""
        self.initialize()
        try:
            click(target)
            # Fixed pause after the click - presumably to let the page react.
            time.sleep(1)
            return f"Clicked '{target}'."
        except Exception as e:
            return f"Error clicking '{target}': {e}"

    def type_text(self, text: str) -> str:
        """Type ``text`` via Helium's ``write`` and return a status string."""
        self.initialize()
        try:
            write(text)
            time.sleep(1)
            return f"Typed '{text}'."
        except Exception as e:
            return f"Error typing '{text}': {e}"


# Shared navigator instance that the tool functions in this notebook call into.
navigator = HeliumNavigator.get_instance()

In [None]:
# Exercise 5.1: Define Tools
# We will wrap the navigator methods into tools.
# Note: get_screenshot returns the base64 string directly for the agent to "see".


@tool
def navigate(url: str) -> str:
    """Navigate the browser to a specific URL."""
    # The docstring is left verbatim because `@tool` uses it as the tool
    # description. The returned status string comes from the navigator.
    status = navigator.navigate(url)
    return status


@tool
def click_element(target: str) -> str:
    """Click an element. 'target' can be the visible text on the button/link."""
    # Delegates to the shared navigator, which answers with a status string
    # (success or error text) instead of raising.
    status = navigator.click_element(target)
    return status


@tool
def type_text(text: str) -> str:
    """Type text into the focused element."""
    # Delegates to the shared navigator; the result is its status string.
    status = navigator.type_text(text)
    return status


# <solution>
@tool
def get_screenshot() -> str:
    """Get the current page screenshot as a base64 string."""
    # The raw base64 payload is returned as-is; turning it into an image
    # message for the model is left to the caller.
    encoded_image = navigator.get_screenshot_b64()
    return encoded_image


# </solution>


# Exercise 5.5: Handling Loop Termination (Didactic)
#
# Logic: The agent loop continues as long as the model calls a tool.
# If the model emits plain text, the loop terminates (see `should_continue` below).
#
# PROBLEM: The model might see the screenshot and just "think" or "describe" what it sees
# in plain text without calling a navigation tool. This causes the loop to end prematurely
# before the task is actually done.

# <solution>
@tool
def think(thought: str) -> str:
    """Use this tool to think, plan, or analyze data/screenshots.
    Do NOT use this tool to communicate with the user.
    """
    # Pure echo with no side effects: the thought is handed straight back as
    # the tool result.
    echoed = thought
    return echoed


# </solution>


# Every tool the model may call.
tools = [navigate, click_element, type_text, get_screenshot, think]
# Name -> tool lookup used to dispatch the model's tool calls. The loop
# variable is deliberately not called `tool`, to avoid confusion with the
# imported `@tool` decorator.
tools_by_name = {t.name: t for t in tools}

# Chat model behind a Databricks serving endpoint, plus a variant that has the
# tools bound to it.
model = ChatDatabricks(endpoint="databricks-claude-sonnet-4-5")
model_with_tools = model.bind_tools(tools)

In [None]:
# State Definition
class VisualState(TypedDict):
    """Graph state shared by the agent and tool nodes: the message list only."""

    # We store the conversation history.
    # `add_messages` is attached as the LangGraph reducer for this key, so the
    # messages a node returns are merged into the existing list rather than
    # replacing it.
    messages: Annotated[list[BaseMessage], add_messages]


# Exercise 5.2: Define Custom Tool Node
# This node executes tools. If the tool is `get_screenshot`, it handles the base64 output
# by creating a multimodal ToolMessage.


def tool_node(state: VisualState):
    """Execute every tool call requested by the latest message.

    Args:
        state: Graph state. The last entry of ``state["messages"]`` must be
            the AI message whose ``tool_calls`` are to be executed.

    Returns:
        ``{"messages": [...]}`` with one ``ToolMessage`` per tool call, in
        request order. Each message carries the tool's name. Screenshot
        results are returned as multimodal content (text + image); everything
        else as plain text. An unknown tool name or an exception raised by a
        tool is reported back as an error message instead of aborting the run.
    """
    messages = state["messages"]
    last_message = messages[-1]
    results = []

    for tool_call in last_message.tool_calls:
        tool_name = tool_call["name"]
        tool_args = tool_call["args"]
        call_id = tool_call["id"]

        selected_tool = tools_by_name.get(tool_name)
        if selected_tool is None:
            # The model requested a tool that does not exist. Answer the call
            # with an error so it can correct itself on the next turn.
            known = ", ".join(tools_by_name)
            results.append(
                ToolMessage(
                    content=f"Error: unknown tool '{tool_name}'. Available tools: {known}",
                    tool_call_id=call_id,
                    name=tool_name,
                )
            )
            continue

        # Invoke the tool
        try:
            output = selected_tool.invoke(tool_args)
        except Exception as e:
            # Tool boundary: hand the failure to the model as the tool result,
            # matching how the navigator's action methods report errors.
            results.append(
                ToolMessage(
                    content=f"Error running '{tool_name}': {e}",
                    tool_call_id=call_id,
                    name=tool_name,
                )
            )
            continue

        # <solution>
        # Special handling for screenshot to make it visible to the model
        if tool_name == "get_screenshot":
            # output is the base64 string
            content = [
                {"type": "text", "text": "Here is the screenshot of the current page."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{output}"},
                },
            ]
            results.append(
                ToolMessage(content=content, tool_call_id=call_id, name=tool_name)
            )
        else:
            # Standard text output
            results.append(
                ToolMessage(content=str(output), tool_call_id=call_id, name=tool_name)
            )
        # </solution>

    return {"messages": results}

In [None]:
# Agent Node
def agent_node(state: VisualState):
    """Ask the tool-enabled model for its next step.

    If the history does not already begin with a system message, the agent's
    instructions are prepended to the list sent to the model. Only the model's
    reply is returned as the state update; the prepended system message is not
    part of it.
    """
    # The history may contain screenshots as image content.
    history = state["messages"]
    starts_with_system = isinstance(history[0], SystemMessage)

    if not starts_with_system:
        # Tell the model it can see, and push all intermediate reasoning into
        # the `think` tool so that plain text is reserved for the final answer.
        instructions = SystemMessage(
            "You are a visual web browsing agent. "
            "For any internal analysis, planning, or screenshot interpretation, you MUST use the `think` tool. "
            "Only output plain text when you are providing the FINAL ANSWER to the user. "
            "When you receive a screenshot, verify strictly if you have the answer. "
            "Do not ask if you should describe it; just do the analysis with the `think` tool."
        )
        history = [instructions, *history]

    reply = model_with_tools.invoke(history)
    return {"messages": [reply]}


def should_continue(state: VisualState):
    """Pick the next step after the agent has spoken.

    Returns ``"tools"`` while the newest message carries tool calls, and
    ``END`` once it does not.
    """
    newest = state["messages"][-1]
    return "tools" if newest.tool_calls else END

In [None]:
# Exercise 5.3: Build the Graph
# Define the nodes and edges for the ReAct architecture.

# <solution>
# The graph's state schema is VisualState.
workflow = StateGraph(VisualState)
# Register both nodes before wiring edges; the edges refer to them by name.
workflow.add_node("agent", agent_node)
workflow.add_node("tools", tool_node)

# Every run starts at the agent.
workflow.set_entry_point("agent")
# After the agent, should_continue returns either "tools" or END; the mapping
# routes "tools" to the tool node and END to the end of the run.
workflow.add_conditional_edges("agent", should_continue, {"tools": "tools", END: END})
# Tool results always go back to the agent, closing the loop.
workflow.add_edge("tools", "agent")

app = workflow.compile()
# </solution>

In [None]:
# Exercise 5.4: Run the Agent
# Instruct the agent to perform a visual task.

# query = "Go to https://www.daytonartinstitute.org/exhibits/janet-fish/ and look at the painting 'Embroidery from Uzbekistan'. List what fruits are depicted in it. Make sure to scroll down to have the complete image captured"
query = "I'm trying to find how hard I have to work to get a repo in github.com/trending. Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?"
print(f"User: {query}")

# <solution>
# Note: We stream to see the steps
inputs = {"messages": [HumanMessage(content=query)]}
# tool_call id -> tool name, collected from the AI messages so that a tool
# result can be labelled even when the ToolMessage itself carries no `name`.
tool_names_by_call_id = {}
for event in app.stream(inputs, stream_mode="values", config={"recursion_limit": 50}):
    # Only the newest message of each streamed state is reported.
    message = event["messages"][-1]
    if isinstance(message, AIMessage):
        print(f"AI: {message.content}")
        if message.tool_calls:
            print(f"   Tools: {message.tool_calls}")
            for call in message.tool_calls:
                tool_names_by_call_id[call["id"]] = call["name"]
    elif isinstance(message, ToolMessage):
        tool_label = message.name or tool_names_by_call_id.get(
            message.tool_call_id, "unknown"
        )
        print(f"Tool ({tool_label}): [Output received]")
# </solution>