https://huggingface.co/docs/smolagents/main/en/examples/web_browser#web-browser-automation-with-agents-

In [1]:
# Installation
! pip install smolagents
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/smolagents.git

Collecting smolagents
  Downloading smolagents-1.21.1-py3-none-any.whl.metadata (16 kB)
Collecting python-dotenv (from smolagents)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading smolagents-1.21.1-py3-none-any.whl (145 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.4/145.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, smolagents
Successfully installed python-dotenv-1.1.1 smolagents-1.21.1


# Web Browser Automation with Agents 🤖🌐

In this notebook, we'll create an **agent-powered web browser automation system**! This system can navigate websites, interact with elements, and extract information automatically.

The agent will be able to:

- [x] Navigate to web pages
- [x] Click on elements
- [x] Search within pages
- [x] Handle popups and modals
- [x] Extract information

Let's set up this system step by step!

First, run these lines to install the required dependencies:

```bash
pip install smolagents selenium helium pillow -q
```

Let's import our required libraries and set up environment variables:

In [2]:
!pip install smolagents selenium helium pillow -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for helium (setup.py) ... [?25l[?25hdone


In [3]:
from io import BytesIO
from time import sleep

import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from smolagents import CodeAgent, tool
from smolagents.agents import ActionStep

# Load environment variables
load_dotenv()

False

Now let's create our core browser interaction tools that will allow our agent to navigate and interact with web pages:

In [4]:
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> None:
    """Goes back to previous page."""
    driver.back()

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
    This does not work on cookie consent banners.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()

Let's set up our browser with Chrome and configure screenshot capabilities:

In [9]:
# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
# Add argument to use a temporary user data directory
chrome_options.add_argument("--user-data-dir=/tmp/chrome_user_data")

# Initialize the browser in headless mode
driver = helium.start_chrome(headless=True, options=chrome_options)

# Set up screenshot callback
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

SessionNotCreatedException: Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
#0 0x5d50693b301a <unknown>
#1 0x5d5068e52a70 <unknown>
#2 0x5d5068e8de07 <unknown>
#3 0x5d5068e885a7 <unknown>
#4 0x5d5068ed893e <unknown>
#5 0x5d5068ed7f06 <unknown>
#6 0x5d5068eca1b3 <unknown>
#7 0x5d5068e9659b <unknown>
#8 0x5d5068e97971 <unknown>
#9 0x5d50693781eb <unknown>
#10 0x5d506937bf39 <unknown>
#11 0x5d506935f2c9 <unknown>
#12 0x5d506937cae8 <unknown>
#13 0x5d5069343baf <unknown>
#14 0x5d50693a00a8 <unknown>
#15 0x5d50693a0286 <unknown>
#16 0x5d50693b1ff6 <unknown>
#17 0x7d1d3aed5ac3 <unknown>


In [10]:
import helium
from selenium import webdriver
from smolagents import CodeAgent
from smolagents.steps import ActionStep
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile  # <-- 1. Import the tempfile library

# --- CORRECTED SECTION ---
# 2. Create a unique temporary directory for the user data
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
# 3. Use the newly created unique directory path
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

# Initialize the browser in headless mode
# The rest of your code remains the same.
driver = helium.start_chrome(headless=True, options=chrome_options)

# Set up screenshot callback
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# It's also good practice to ensure the browser is closed at the end
# to prevent this issue from happening again.
# For example, in a try...finally block or at the end of your script:
#
# try:
#   # ... your agent code here ...
# finally:
#   helium.kill_browser()

ModuleNotFoundError: No module named 'smolagents.steps'

In [12]:
import helium
from selenium import webdriver
from smolagents import CodeAgent, ActionStep # <-- CORRECTED THIS LINE
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile

# Create a unique temporary directory for the user data
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

# Initialize the browser in headless mode
driver = helium.start_chrome(headless=True, options=chrome_options)

# Set up screenshot callback
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# Now you can continue with the rest of your agent setup and execution.

Using temporary user data directory: /tmp/tmpbrewy5a4


SessionNotCreatedException: Message: session not created: probably user data directory is already in use, please specify a unique value for --user-data-dir argument, or don't use --user-data-dir; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
#0 0x58e3b061301a <unknown>
#1 0x58e3b00b2a70 <unknown>
#2 0x58e3b00ede07 <unknown>
#3 0x58e3b00e85a7 <unknown>
#4 0x58e3b013893e <unknown>
#5 0x58e3b0137f06 <unknown>
#6 0x58e3b012a1b3 <unknown>
#7 0x58e3b00f659b <unknown>
#8 0x58e3b00f7971 <unknown>
#9 0x58e3b05d81eb <unknown>
#10 0x58e3b05dbf39 <unknown>
#11 0x58e3b05bf2c9 <unknown>
#12 0x58e3b05dcae8 <unknown>
#13 0x58e3b05a3baf <unknown>
#14 0x58e3b06000a8 <unknown>
#15 0x58e3b0600286 <unknown>
#16 0x58e3b0611ff6 <unknown>
#17 0x7f8b67a8aac3 <unknown>


In [13]:
import helium
from selenium import webdriver
from smolagents import CodeAgent, ActionStep
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile
import atexit # For robust cleanup

# Create a unique temporary directory for the user data
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()

# --- CRITICAL ADDITIONS FOR STABILITY IN LINUX/CONTAINER ENVIRONMENTS ---
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# --- END OF CRITICAL ADDITIONS ---

chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

# Initialize the browser
# We pass the options directly and let the '--headless' argument handle the mode.
driver = helium.start_chrome(options=chrome_options)

# It's good practice to ensure the browser is always killed when the script finishes or crashes.
atexit.register(helium.kill_browser)

print("Browser started successfully!")

# Set up screenshot callback
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# Now you can continue with the rest of your agent setup and execution.
# The `atexit` command will ensure helium.kill_browser() is called when your script ends.

Using temporary user data directory: /tmp/tmp612pi6d0
Browser started successfully!


Now let's create our web automation agent:

In [15]:
from smolagents import InferenceClientModel

# Initialize the model
model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
model = InferenceClientModel(model_id=model_id)

# Create the agent
agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)

# Import helium for the agent
agent.python_executor("from helium import *") # Removed the extra argument

CodeOutput(output=None, logs='', is_final_answer=False)

The agent needs instructions on how to use Helium for web automation. Here are the instructions we'll provide:

In [16]:
helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""

Now we can run our agent with a task! Let's try finding information on Wikipedia:

In [17]:
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

agent_output = agent.run(search_request + helium_instructions)
print("Final output:")
print(agent_output)

Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Final output:
Code:
scroll_down(num_pixels=1200)
```<end_code>
Calling tools:
[{'id': 'call_3', 'type': 'function', 'function': {'name': 'python_interpreter', 'arguments': 'scroll_down(num_pixels=1200)'}}]


You can run different tasks by modifying the request. For example, here's for me to know if I should work harder:

In [18]:
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""

agent_output = agent.run(github_request + helium_instructions)
print("Final output:")
print(agent_output)

Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Final output:
The page cannot be navigated to, defaulting to next query!


The system is particularly effective for tasks like:
- Data extraction from websites
- Web research automation
- UI testing and verification
- Content monitoring

### شغال Api

In [20]:
from io import BytesIO
from time import sleep

import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from smolagents import CodeAgent, tool
from smolagents.agents import ActionStep

# Load environment variables
load_dotenv()
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> None:
    """Goes back to previous page."""
    driver.back()

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
    This does not work on cookie consent banners.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
import helium
from selenium import webdriver
from smolagents import CodeAgent, ActionStep
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile
import atexit # For robust cleanup

# Create a unique temporary directory for the user data
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()

# --- CRITICAL ADDITIONS FOR STABILITY IN LINUX/CONTAINER ENVIRONMENTS ---
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# --- END OF CRITICAL ADDITIONS ---

chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

# Initialize the browser
# We pass the options directly and let the '--headless' argument handle the mode.
driver = helium.start_chrome(options=chrome_options)

# It's good practice to ensure the browser is always killed when the script finishes or crashes.
atexit.register(helium.kill_browser)

print("Browser started successfully!")

# Set up screenshot callback
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# Now you can continue with the rest of your agent setup and execution.
# The `atexit` command will ensure helium.kill_browser() is called when your script ends.
from smolagents import InferenceClientModel

# Initialize the model
model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
model = InferenceClientModel(model_id=model_id)

# Create the agent
agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)

# Import helium for the agent
agent.python_executor("from helium import *") # Removed the extra argument
helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

agent_output = agent.run(search_request + helium_instructions)
print("Final output:")
print(agent_output)
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""

agent_output = agent.run(github_request + helium_instructions)
print("Final output:")
print(agent_output)

Using temporary user data directory: /tmp/tmp2hynfhh1
Browser started successfully!


Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Final output:
Thought: Since my last attempt to find a sentence containing both "1992" and "construction accident" was not successful due to an error, I will now go to the Wikipedia page for Chicago and manually search for such a sentence.
Action: go_to('https://en.wikipedia.org/wiki/Chicago')
Code:
go_to('https://en.wikipedia.org/wiki/Chicago')
print(screenshot())
```</code>


Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Captured a browser screenshot: (1000, 1158) pixels


Final output:
Thought: As I can't use the Wikipedia API to retrieve the trending repos from GitHub, I'll have to use the helium tool to navigate to GitHub's trending page manually. I'll first click on the "Top products" then parse the HTML of the page using BeautifulSoup or the built-in HTML parser of helium to find the top author of the top trending repo. From the author's profile, I should also be able to find the total number of their commits over the last year.

<code>
# Navigate to Github trending page
go_to("https://github.com/trending")

# Handle any popup
if Text("Dismiss").exists():
    click(Link("Dismiss"))

# Open the top trending repo
click(Link("GitHub trending - Enter search keywords"))

# E
</code>


In [1]:
#!pip install smolagents "transformers>=4.41.0" torch accelerate bitsandbytes Pillow python-dotenv helium selenium

# -*- coding: utf-8 -*-
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

# إنشاء مجلد بيانات مستخدم مؤقت وفريد لكل جلسة
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

# إعداد خيارات Chrome للتشغيل في بيئات مثل Colab
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

# تهيئة المتصفح
driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser) # ضمان إغلاق المتصفح عند انتهاء البرنامج
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # انتظر قليلاً حتى تكتمل الرسوم المتحركة
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: (image.size) pixels")

        # Add the screenshot to the memory step observations
        memory_step.observations_images = [image.copy()]


    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- القسم المعدل: تهيئة النموذج المحلي ---

# سنستخدم نموذج Phi-3 Vision الصغير، وهو مناسب للتجربة على وحدة المعالجة المركزية
# سيتم تنزيله وتخزينه مؤقتًا في المرة الأولى.
print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

# model_kwargs لتمرير وسائط إضافية إلى Hugging Face transformers
# trust_remote_code=True: ضروري لتحميل النماذج التي تحتوي على كود مخصص مثل نماذج الرؤية
model = TransformersModel(
    model_id=model_id,
    torch_dtype=torch.bfloat16, # Pass torch_dtype directly
    model_kwargs={
        "trust_remote_code": True,
    }
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8, # تمت زيادة عدد الخطوات لإعطاء الوكيل فرصة أكبر
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

# استيراد helium في بيئة تنفيذ الوكيل
agent.python_executor("from helium import *")

helium_instructions_lines = [
    "You can use helium to access websites. Don't bother about the helium driver, it's already managed.",
    'We\'ve already ran "from helium import *"',
    "Then you can go to pages!",
    "Code:",
    "go_to('github.com/trending')",
    "```<end_code>",
    "",
    "You can directly click clickable elements by inputting the text that appears on them.",
    "Code:",
    "click(\"Top products\")",
    "```<end_code>",
    "",
    "If it's a link:",
    "Code:",
    "click(Link(\"Top products\"))",
    "```<end_code>",
    "",
    "If you try to interact with an element and it's not found, you'll get a LookupError.",
    "In general stop your action after each button click to see what happens on your screenshot.",
    "Never try to login in a page.",
    "",
    "To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.",
    "Code:",
    "scroll_down(num_pixels=1200) # This will scroll one viewport down",
    "```<end_code>",
    "",
    "When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).",
    "Just use your built-in tool `close_popups` to close them:",
    "Code:",
    "close_popups()",
    "```<end_code>",
    "",
    "You can use .exists() to check for the existence of an element. For example:",
    "Code:",
    "if Text('Accept cookies?').exists():",
    "    click('I accept')",
    "```<end_code>"
]

helium_instructions = "\n".join(helium_instructions_lines)

Using temporary user data directory: /tmp/tmpxvpwwqmc
Browser started successfully!
Loading local model... This may take a while.




ValueError: Failed to load tokenizer and model for model_id='microsoft/phi-3-vision-128k-instruct': transformers.models.auto.auto_factory._BaseAutoModelClass.from_pretrained() got multiple values for keyword argument 'trust_remote_code'

In [2]:
# -*- coding: utf-8 -*-
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- القسم المصحح: تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

# تم نقل `trust_remote_code=True` كمعامل مباشر خارج `model_kwargs`
model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,  # <-- الحل: تم نقله هنا
    model_kwargs={
        "torch_dtype": "auto", # استخدام 'auto' أكثر مرونة
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups, use your built-in tool `close_popups`.

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmpgh59vqkb




Browser started successfully!
Loading local model... This may take a while.


ValueError: Failed to load tokenizer and model for model_id='microsoft/phi-3-vision-128k-instruct': transformers.models.auto.auto_factory._BaseAutoModelClass.from_pretrained() got multiple values for keyword argument 'torch_dtype'

In [4]:
# -*- coding: utf-8 -*-
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- القسم المصحح: تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

# تم نقل `torch_dtype="auto"` كمعامل مباشر خارج `model_kwargs`
model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",  # <-- الحل: تم نقله هنا
    model_kwargs={
        # الوسائط الأقل شيوعًا أو الخاصة تبقى هنا
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")


# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups, use your built-in tool `close_popups`.

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmpk0qzeab4




Browser started successfully!
Loading local model... This may take a while.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.

--- Starting Wikipedia Task ---


Captured a browser screenshot: (1000, 1158) pixels


AgentGenerationError: Error in generating model output:
'DynamicCache' object has no attribute 'seen_tokens'

In [None]:
microsoft/Florence-2-large

In [5]:
# -*- coding: utf-8 -*-
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- القسم المصحح: تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

# تم نقل `torch_dtype="auto"` كمعامل مباشر خارج `model_kwargs`
model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={
        # الوسائط الأقل شيوعًا أو الخاصة تبقى هنا
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")


# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups, use your built-in tool `close_popups`.

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmp4dmjg_in




Browser started successfully!
Loading local model... This may take a while.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model loaded successfully.

--- Starting Wikipedia Task ---


Captured a browser screenshot: (1000, 1158) pixels


AgentGenerationError: Error in generating model output:
'DynamicCache' object has no attribute 'seen_tokens'

In [3]:
# -*- coding: utf-8 -*-
# تأكد من تشغيل pip install transformers==4.40.2 وإعادة تشغيل الـ runtime قبل هذا الكود

from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>
You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=800)
```<end_code>
When you have pop-ups, use your built-in tool `close_popups`.
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmptz8038hg




Browser started successfully!
Loading local model... This may take a while.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully.

--- Starting Wikipedia Task ---


Captured a browser screenshot: (1000, 1158) pixels


AgentGenerationError: Error in generating model output:
'DynamicCache' object has no attribute 'seen_tokens'

In [5]:
# -*- coding: utf-8 -*-
# قم بتشغيل هذا الكود فقط بعد التأكد من أن إصدار transformers هو 4.40.2

from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>
You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=800)
```<end_code>
When you have pop-ups, use your built-in tool `close_popups`.
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmp6_rgkgeb




Browser started successfully!
Loading local model... This may take a while.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model loaded successfully.

--- Starting Wikipedia Task ---


Captured a browser screenshot: (1000, 1158) pixels


AgentGenerationError: Error in generating model output:
'DynamicCache' object has no attribute 'seen_tokens'

In [1]:
# -*- coding: utf-8 -*-
import sys
import transformers

# --- حارس التحقق من الإصدار (Version Check Guard) ---
# هذا الكود سيتوقف فورًا إذا لم يكن إصدار المكتبة صحيحًا.
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print("="*60)
    print(f"خطأ فادح: إصدار transformers غير صحيح!")
    print(f"الإصدار المطلوب: {REQUIRED_VERSION}")
    print(f"الإصدار المثبت حاليًا: {transformers.__version__}")
    print("الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت مرة أخرى.")
    print("="*60)
    sys.exit("إيقاف التنفيذ بسبب عدم توافق الإصدارات.")

print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الكود الخاص بك (بدون تغيير) ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

load_dotenv()

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence."""
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={"_attn_implementation": "eager"}
)
print("Model loaded successfully.")

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

agent.python_executor("from helium import *")
helium_instructions = """
You can use helium to access websites. We've already ran "from helium import *".
Go to pages with go_to('example.com'). Click elements with click("Button Text").
Scroll with scroll_down(num_pixels=800). Use your `close_popups` tool for pop-ups.
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

helium.kill_browser()

خطأ فادح: إصدار transformers غير صحيح!
الإصدار المطلوب: 4.40.2
الإصدار المثبت حاليًا: 4.55.1
الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت مرة أخرى.


SystemExit: إيقاف التنفيذ بسبب عدم توافق الإصدارات.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor

model_id = "microsoft/Phi-3-vision-128k-instruct"

model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."},
    {"role": "user", "content": "Provide insightful questions to spark discussion."}
]

url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw)

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")

generation_args = {
    "max_new_tokens": 500,
    "temperature": 0.0,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# remove input tokens
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)


config.json: 0.00B [00:00, ?B/s]

configuration_phi3_v.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- configuration_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3_v.py: 0.00B [00:00, ?B/s]

image_embedding_phi3_v.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- image_embedding_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct:
- modeling_phi3_v.py
- image_embedding_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.35G [00:00<?, ?B/s]

In [2]:
# -*- coding: utf-8 -*-
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

# تحميل متغيرات البيئة (إذا كنت تستخدم ملف .env)
load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements:
        return f"No matches found for '{text}'."
    if nth_result > len(elements):
        return f"Match n°{nth_result} not found (only {len(elements)} matches found)"

    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page by sending the ESCAPE key.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد متصفح الويب (Selenium + Helium) ---

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) لأخذ لقطات الشاشة ---

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]

    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )

# --- القسم المصحح: تهيئة النموذج المحلي ---

print("Loading local model... This may take a while.")
model_id = "microsoft/Florence-2-large"

# تم نقل `torch_dtype="auto"` كمعامل مباشر خارج `model_kwargs`
model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={
        # الوسائط الأقل شيوعًا أو الخاصة تبقى هنا
        "_attn_implementation": "eager"
    }
)
print("Model loaded successfully.")


# --- تهيئة الوكيل ---

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---

agent.python_executor("from helium import *")

helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
click("Top products")
```<end_code>

If it's a link:
Code:
click(Link("Top products"))
```<end_code>

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups, use your built-in tool `close_popups`.

You can use .exists() to check for the existence of an element. For example:
Code:
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
# --- تشغيل الوكيل للمهمة الأولى ---

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

# إغلاق المتصفح في النهاية
helium.kill_browser()

Using temporary user data directory: /tmp/tmp9gb1h04d
Browser started successfully!
Loading local model... This may take a while.




config.json: 0.00B [00:00, ?B/s]

configuration_florence2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- configuration_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_florence2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- modeling_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers_modules.microsoft.Florence-2-large.21a599d414c4d928c9032694c424fb94458e3594.configuration_florence2.Florence2Config'> to build an AutoTokenizer.
Model type should be one of Aimv2Config, AlbertConfig, AlignConfig, ArceeConfig, AriaConfig, AyaVisionConfig, BarkConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BitNetConfig, BlenderbotConfig, BlenderbotSmallConfig, BlipConfig, Blip2Config, BloomConfig, BridgeTowerConfig, BrosConfig, CamembertConfig, CanineConfig, ChameleonConfig, ChineseCLIPConfig, ClapConfig, CLIPConfig, CLIPSegConfig, ClvpConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, ColPaliConfig, ColQwen2Config, ConvBertConfig, CpmAntConfig, CTRLConfig, Data2VecAudioConfig, Data2VecTextConfig, DbrxConfig, DebertaConfig, DebertaV2Config, DeepseekV2Config, DeepseekV3Config, DeepseekVLConfig, DeepseekVLHybridConfig, DiaConfig, DiffLlamaConfig, DistilBertConfig, DPRConfig, ElectraConfig, Emu3Config, ErnieConfig, Ernie4_5Config, Ernie4_5_MoeConfig, ErnieMConfig, EsmConfig, Exaone4Config, FalconConfig, FalconMambaConfig, FastSpeech2ConformerConfig, FlaubertConfig, FNetConfig, FSMTConfig, FunnelConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, Gemma3nConfig, Gemma3nTextConfig, GitConfig, GlmConfig, Glm4Config, Glm4MoeConfig, Glm4vConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GptOssConfig, GPTJConfig, GPTSanJapaneseConfig, GraniteConfig, GraniteMoeConfig, GraniteMoeHybridConfig, GraniteMoeSharedConfig, GroundingDinoConfig, GroupViTConfig, HeliumConfig, HubertConfig, IBertConfig, IdeficsConfig, Idefics2Config, Idefics3Config, InstructBlipConfig, InstructBlipVideoConfig, InternVLConfig, JambaConfig, JanusConfig, JetMoeConfig, JukeboxConfig, Kosmos2Config, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LlamaConfig, Llama4Config, Llama4TextConfig, LlavaConfig, LlavaNextConfig, LlavaNextVideoConfig, LlavaOnevisionConfig, LongformerConfig, LongT5Config, LukeConfig, LxmertConfig, M2M100Config, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MgpstrConfig, MiniMaxConfig, MistralConfig, MixtralConfig, MllamaConfig, MMGroundingDinoConfig, MobileBertConfig, ModernBertConfig, MoonshineConfig, MoshiConfig, MPNetConfig, MptConfig, MraConfig, MT5Config, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, NezhaConfig, NllbMoeConfig, NystromformerConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OmDetTurboConfig, OneFormerConfig, OpenAIGPTConfig, OPTConfig, Owlv2Config, OwlViTConfig, PaliGemmaConfig, PegasusConfig, PegasusXConfig, PerceiverConfig, PersimmonConfig, PhiConfig, Phi3Config, PhimoeConfig, Pix2StructConfig, PixtralVisionConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2_5OmniConfig, Qwen2_5_VLConfig, Qwen2AudioConfig, Qwen2MoeConfig, Qwen2VLConfig, Qwen3Config, Qwen3MoeConfig, RagConfig, RealmConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RetriBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, ShieldGemma2Config, SiglipConfig, Siglip2Config, SmolLM3Config, Speech2TextConfig, Speech2Text2Config, SpeechT5Config, SplinterConfig, SqueezeBertConfig, StableLmConfig, Starcoder2Config, SwitchTransformersConfig, T5Config, T5GemmaConfig, TapasConfig, TransfoXLConfig, TvpConfig, UdopConfig, UMT5Config, VideoLlavaConfig, ViltConfig, VipLlavaConfig, VisualBertConfig, VitsConfig, VoxtralConfig, Wav2Vec2Config, Wav2Vec2BertConfig, Wav2Vec2ConformerConfig, WhisperConfig, XCLIPConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, xLSTMConfig, XmodConfig, YosoConfig, ZambaConfig, Zamba2Config.

In [4]:
!pip uninstall -y transformers accelerate
!pip install transformers==4.40.2 accelerate==0.28.0 smolagents helium selenium python-dotenv

Found existing installation: transformers 4.55.1
Uninstalling transformers-4.55.1:
  Successfully uninstalled transformers-4.55.1
Found existing installation: accelerate 1.10.0
Uninstalling accelerate-1.10.0:
  Successfully uninstalled accelerate-1.10.0
Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.28.0
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━

In [1]:


# -*- coding: utf-8 -*-
import sys
import transformers

# --- حارس التحقق من الإصدار (Version Check Guard) ---
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print("="*60)
    print(f"خطأ فادح: إصدار transformers غير صحيح!")
    print(f"الإصدار المطلوب: {REQUIRED_VERSION}")
    print(f"الإصدار المثبت حاليًا: {transformers.__version__}")
    print("الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت في الخطوة 2 مرة أخرى.")
    print("="*60)
    sys.exit("إيقاف التنفيذ بسبب عدم توافق الإصدارات.")

print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الكود الخاص بك (بدون تغيير) ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

load_dotenv()

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence."""
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={"_attn_implementation": "eager"}
)
print("Model loaded successfully.")

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

agent.python_executor("from helium import *")
helium_instructions = """
You can use helium to access websites. We've already ran "from helium import *".
Go to pages with go_to('example.com'). Click elements with click("Button Text").
Scroll with scroll_down(num_pixels=800). Use your `close_popups` tool for pop-ups.
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

helium.kill_browser()

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

تم التحقق بنجاح. إصدار transformers هو 4.40.2.
--------------------


DocstringParsingException: Cannot generate JSON schema for search_item_ctrl_f because the docstring has no description for the argument 'text'

In [2]:
# -*- coding: utf-8 -*-
import sys
import transformers
import warnings

# تجاهل التحذيرات غير الهامة
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.module")

# --- حارس التحقق من الإصدار (Version Check Guard) ---
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print("="*60)
    print(f"خطأ فادح: إصدار transformers غير صحيح!")
    print(f"الإصدار المطلوب: {REQUIRED_VERSION}")
    print(f"الإصدار المثبت حاليًا: {transformers.__version__}")
    print("الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت مرة أخرى.")
    print("="*60)
    sys.exit("إيقاف التنفيذ بسبب عدم توافق الإصدارات.")

print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الكود الخاص بك ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل (مع التعليق التوضيحي المصحح) ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence.

    Args:
        text (str): The text to search for on the page.
        nth_result (int): Which occurrence to jump to (e.g., 1 for the first match). Defaults to 1.
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)

print("Loading local model... This may take a while.")
model_id = "microsoft/phi-3-vision-128k-instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={"_attn_implementation": "eager"}
)
print("Model loaded successfully.")

agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

agent.python_executor("from helium import *")
helium_instructions = """
You can use helium to access websites. We've already ran "from helium import *".
Go to pages with go_to('example.com'). Click elements with click("Button Text").
Scroll with scroll_down(num_pixels=800). Use your `close_popups` tool for pop-ups.
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

helium.kill_browser()

تم التحقق بنجاح. إصدار transformers هو 4.40.2.
--------------------
Using temporary user data directory: /tmp/tmpd28kzx0n
Browser started successfully!
Loading local model... This may take a while.


ImportError: cannot import name 'AutoModelForImageTextToText' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [3]:
# -*- coding: utf-8 -*-
import sys
import transformers
import warnings

# تجاهل التحذيرات غير الهامة
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.module")

# --- حارس التحقق من الإصدار (Version Check Guard) ---
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print("="*60)
    print(f"خطأ فادح: إصدار transformers غير صحيح!")
    print(f"الإصدار المطلوب: {REQUIRED_VERSION}")
    print(f"الإصدار المثبت حاليًا: {transformers.__version__}")
    print("الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت مرة أخرى.")
    print("="*60)
    sys.exit("إيقاف التنفيذ بسبب عدم توافق الإصدارات.")

print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الكود الخاص بك ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

load_dotenv()

# --- تعريف الأدوات التي سيستخدمها الوكيل ---

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence.

    Args:
        text (str): The text to search for on the page.
        nth_result (int): Which occurrence to jump to (e.g., 1 for the first match). Defaults to 1.
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد المتصفح ---
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- دالة رد الاتصال (Callback) ---
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)

# --- تهيئة النموذج المحلي (مع نموذج LLaVA المتوافق) ---
print("Loading local model... This may take a while.")
# --- تم تغيير معرّف النموذج ---
# LLaVA هو نموذج VLM معروف ومتوافق مع الإصدارات القديمة من transformers.
model_id = "llava-hf/llava-1.5-7b-hf"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
    model_kwargs={"_attn_implementation": "eager"}
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---
agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---
agent.python_executor("from helium import *")
helium_instructions = """
You can use helium to access websites. We've already ran "from helium import *".
Go to pages with go_to('example.com'). Click elements with click("Button Text").
Scroll with scroll_down(num_pixels=800). Use your `close_popups` tool for pop-ups.
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

helium.kill_browser()

تم التحقق بنجاح. إصدار transformers هو 4.40.2.
--------------------
Using temporary user data directory: /tmp/tmpdqqo9jch
Browser started successfully!
Loading local model... This may take a while.


ImportError: cannot import name 'AutoModelForImageTextToText' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [6]:
# -*- coding: utf-8 -*-
import sys
import transformers
import warnings

# --- التحقق من الإصدار ---
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print(f"خطأ: إصدار transformers غير صحيح! المطلوب: {REQUIRED_VERSION}, المثبت: {transformers.__version__}")
    sys.exit("إيقاف التنفيذ.")
print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الاستيرادات ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

# --- استيرادات LangChain الجديدة ---
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import tool
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# --- إعداد المتصفح والأدوات (بدون تغيير) ---
load_dotenv()

@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence."""
    # ... (الكود هنا لم يتغير)
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def go_to_url(url: str) -> str:
    """Navigates the browser to the specified URL."""
    helium.go_to(url)
    return f"Navigated to {url}"

# إعداد المتصفح
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1000,1350")
driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# --- القسم الجديد: تهيئة النموذج والوكيل باستخدام LangChain ---

print("Loading local model via LangChain... This may take a while.")

# استخدام HuggingFacePipeline للتحكم الكامل في تحميل النموذج
# LLaVA ليس نموذج نصي فقط، لذلك سنستخدم نموذجًا أبسط متوافقًا مع الوكلاء النصيين
# مثل `mistralai/Mistral-7B-Instruct-v0.2` لإثبات أن المنهجية تعمل.
# ملاحظة: التحكم في المتصفح بالرؤية يتطلب بنية وكيل أكثر تعقيدًا (multi-modal agent).
llm = HuggingFacePipeline.from_model_id(
    model_id="Qwen/Qwen2.5-0.5B",
    task="text-generation",
    model_kwargs={
        "torch_dtype": torch.float32,
        "device_map": "auto",
        "trust_remote_code": True,
    },
    pipeline_kwargs={"max_new_tokens": 10},
    device=-1 # Explicitly set device to -1 to avoid conflict with device_map
)
print("Model loaded successfully.")

# تعريف الأدوات للوكيل
tools = [search_item_ctrl_f, go_back, go_to_url]

# إنشاء قالب التوجيه (Prompt Template) للوكيل
# هذا القالب يوجه النموذج لكيفية استخدام الأدوات والتفكير خطوة بخطوة
prompt_template = """Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}"""

prompt = PromptTemplate.from_template(prompt_template)

# Define the tools_on_screen_prompt
tools_on_screen_prompt = "You can use the following tools: search_item_ctrl_f, go_back, go_to_url"


# إنشاء الوكيل
agent = create_react_agent(llm, tools, prompt, tools_on_screen_prompt=tools_on_screen_prompt) # Add the missing argument

# إنشاء منفذ الوكيل (Agent Executor) الذي سيقوم بتشغيل دورة التفكير
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

# --- تشغيل الوكيل ---

search_request = """
First, navigate to https://en.wikipedia.org/wiki/Chicago. Then, search for the word "1992" on the page and tell me the sentence that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task with LangChain Agent ---")
try:
    result = agent_executor.invoke({"input": search_request})
    print("\n--- Final output for Wikipedia Task: ---")
    print(result.get("output"))
except Exception as e:
    print(f"An error occurred: {e}")

helium.kill_browser()

تم التحقق بنجاح. إصدار transformers هو 4.40.2.
--------------------
Using temporary user data directory: /tmp/tmpcx3kw1rh
Browser started successfully!
Loading local model via LangChain... This may take a while.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.

In [5]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.1.0-py3-n

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Import necessary modules for tools
import helium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from smolagents import tool, ActionStep, ToolCallingAgent, LocalPythonExecutor # Import LocalPythonExecutor
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile
import atexit

# Define the tool functions (copied from n-jDM4I0peNf)
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence.

    Args:
        text (str): The text to search for on the page.
        nth_result (int): Which occurrence to jump to (e.g., 1 for the first match). Defaults to 1.
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# Setup browser (copied from n-jDM4I0peNf, adjusted for headless)
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# Screenshot callback (copied from n-jDM4I0peNf)
def save_screenshot(memory_step: ActionStep, agent: ToolCallingAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)


MODEL_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"

print("Downloading / loading the small local model …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,   # CPU-safe
    device_map="auto"             # Falls back to CPU if no GPU
)

def small_local_vlm(messages, **kwargs):
    """
    Minimal wrapper that behaves like the cloud InferenceClientModel
    for CodeAgent (text-only, no vision).
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=kwargs.get("max_tokens", 512),
            temperature=0.0,
            do_sample=False
        )
    answer = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    # strip the original prompt so we only return the assistant reply
    return answer.split("<|assistant|>")[-1].strip()

# Monkey-patch so the agent uses our local LLM
from smolagents import ToolCallingAgent  # or CodeAgent
class LocalModel:
    def __call__(self, messages, stop=None, **kw):
        return small_local_vlm(messages, **kw)

agent = ToolCallingAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=LocalModel(),          # <-- our free local model
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)
# agent.python_executor("from helium import *") # Removed this line
# Assuming search_request and helium_instructions are defined elsewhere or will be provided later
# agent.run(search_request + helium_instructions)

Using temporary user data directory: /tmp/tmps4vj9gjs
Browser started successfully!
Downloading / loading the small local model …




In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Import necessary modules for tools
import helium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from smolagents import tool, ActionStep, ToolCallingAgent
from PIL import Image
from io import BytesIO
from time import sleep
import tempfile
import atexit

# Define the tool functions (copied from n-jDM4I0peNf)
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence.

    Args:
        text (str): The text to search for on the page.
        nth_result (int): Which occurrence to jump to (e.g., 1 for the first match). Defaults to 1.
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# Setup browser (copied from n-jDM4I0peNf, adjusted for headless)
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")
chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# Screenshot callback (copied from n-jDM4I0peNf)
def save_screenshot(memory_step: ActionStep, agent: ToolCallingAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)


MODEL_NAME = "HuggingFaceTB/SmolLM-135M-Instruct"

print("Downloading / loading the small local model …")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,   # CPU-safe
    device_map="auto"             # Falls back to CPU if no GPU
)

def small_local_vlm(messages, **kwargs):
    """
    Minimal wrapper that behaves like the cloud InferenceClientModel
    for CodeAgent (text-only, no vision).
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=kwargs.get("max_tokens", 512),
            temperature=0.0,
            do_sample=False
        )
    answer = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    # strip the original prompt so we only return the assistant reply
    return answer.split("<|assistant|>")[-1].strip()

# Monkey-patch so the agent uses our local LLM
from smolagents import ToolCallingAgent  # or CodeAgent
class LocalModel:
    def __call__(self, messages, stop=None, **kw):
        return small_local_vlm(messages, **kw)

tools_on_screen_prompt = """Available tools:
- go_back: Goes back to the previous page.
- close_popups: Closes pop-ups by sending the ESCAPE key.
- search_item_ctrl_f: Searches for text on the current page and jumps to the nth occurrence.
"""

agent = ToolCallingAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=LocalModel(),          # <-- our free local model
    tools_on_screen_prompt=tools_on_screen_prompt, # Add the required argument
    step_callbacks=[save_screenshot],
    max_steps=2,
    verbosity_level=2,
)
# agent.python_executor("from helium import *") # Removed this line
# Assuming search_request and helium_instructions are defined elsewhere or will be provided later
# agent.run(search_request + helium_instructions)

Using temporary user data directory: /tmp/tmppvjzc9ym
Browser started successfully!
Downloading / loading the small local model …




TypeError: MultiStepAgent.__init__() got an unexpected keyword argument 'tools_on_screen_prompt'

In [18]:
# -*- coding: utf-8 -*-
import sys
import transformers
import warnings

# تجاهل التحذيرات غير الهامة
warnings.filterwarnings("ignore")

# --- حارس التحقق من الإصدار ---
REQUIRED_VERSION = "4.40.2"
if transformers.__version__ != REQUIRED_VERSION:
    print("="*60)
    print(f"خطأ فادح: إصدار transformers غير صحيح!")
    print(f"الإصدار المطلوب: {REQUIRED_VERSION}, المثبت: {transformers.__version__}")
    print("الرجاء تنفيذ 'Runtime -> Factory reset runtime' ثم تشغيل خلية التثبيت مرة أخرى.")
    print("="*60)
    sys.exit("إيقاف التنفيذ.")

print(f"تم التحقق بنجاح. إصدار transformers هو {transformers.__version__}.")
print("-" * 20)

# --- بقية الكود ---
from io import BytesIO
from time import sleep
import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import tempfile
import atexit
import torch

from smolagents import CodeAgent, tool, ActionStep, TransformersModel

load_dotenv()

# --- تعريف الأدوات ---
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """Searches for text on the current page and jumps to the nth occurrence.

    Args:
        text (str): The text to search for on the page.
        nth_result (int): Which occurrence to jump to. Defaults to 1.
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if not elements: return f"No matches found for '{text}'."
    if nth_result > len(elements): return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
    result = f"Found {len(elements)} matches for '{text}'.\n"
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result

@tool
def go_back() -> str:
    """Goes back to the previous page."""
    driver.back()
    return "Navigated back to the previous page."

@tool
def close_popups() -> str:
    """Closes pop-ups by sending the ESCAPE key."""
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Sent ESCAPE key to close pop-ups."

# --- إعداد المتصفح ---
user_data_dir = tempfile.mkdtemp()
print(f"Using temporary user data directory: {user_data_dir}")
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1000,1350")
driver = helium.start_chrome(options=chrome_options)
atexit.register(helium.kill_browser)
print("Browser started successfully!")

# دالة لقطة الشاشة لا تزال مفيدة لنا للمراقبة، ولكن الوكيل لن يستخدم الصور
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)
    driver = helium.get_driver()
    if driver is not None:
        print(f"Taking screenshot of page: {driver.current_url}")
        # الوكيل لن يرى هذه الصورة، لكنها مفيدة لنا
        # memory_step.observations_images = [Image.open(BytesIO(driver.get_screenshot_as_png()))]
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info)

# --- تهيئة النموذج المحلي (نموذج نصي فقط) ---
print("Loading local TEXT-ONLY model... This may take a while.")
# --- تم تغيير النموذج إلى نموذج نصي لتجنب خطأ الاستيراد ---
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"

model = TransformersModel(
    model_id=model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
print("Model loaded successfully.")

# --- تهيئة الوكيل ---
agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=8,
    verbosity_level=2,
)

# --- تعليمات الوكيل ---
agent.python_executor("from helium import *")
helium_instructions = """
You are a web assistant. You can use helium to browse websites. We've already run "from helium import *".
- Use `go_to('example.com')` to navigate.
- Use `click("Button Text")` to click elements.
- Use `scroll_down(num_pixels=800)` to scroll.
- Use your tools like `search_item_ctrl_f` to find information.
- You are blind. You MUST rely on the text output from your actions and tools to decide the next step.
"""
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

print("\n--- Starting Wikipedia Task ---")
agent_output = agent.run(search_request + helium_instructions)
print("\n--- Final output for Wikipedia Task: ---")
print(agent_output)

helium.kill_browser()

تم التحقق بنجاح. إصدار transformers هو 4.40.2.
--------------------
Using temporary user data directory: /tmp/tmpfbbnc1y9
Browser started successfully!
Loading local TEXT-ONLY model... This may take a while.


ImportError: cannot import name 'AutoModelForImageTextToText' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [2]:
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = TFGPT2Model.from_pretrained('gpt2-medium')
text = "hi"
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)




tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.
