In [1]:
# Copyright (c) Meta Platforms, Inc. and affiliates.

# SAM 3 Agent

This notebook shows an example of how an MLLM can use SAM 3 as a tool, i.e., "SAM 3 Agent", to segment more complex text queries such as "the leftmost child wearing blue vest".

## Env Setup

First install `sam3` in your environment using the [installation instructions](https://github.com/facebookresearch/sam3?tab=readme-ov-file#installation) in the repository.

In [2]:
%%capture
!conda create -n sam3 python=3.12
!conda deactivate
!conda activate sam3

In [3]:
!pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch==2.7.0
  Downloading https://download.pytorch.org/whl/cu126/torch-2.7.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting nvidia-cudnn-cu12==9.5.1.17 (from torch==2.7.0)
  Downloading https://pypi.nvidia.com/nvidia-cudnn-cu12/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl (571.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cusparselt-cu12==0.6.3 (from torch==2.7.0)
  Downloading https://pypi.nvidia.com/nvidia-cusparselt-cu12/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl (156.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.8/156.8 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-nccl-cu12==2.26.2 (from torch==2.7.0)
  Downloading https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014

In [4]:
!pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126


In [5]:
%%capture
# Remove existing sam3 directory if it exists to ensure a clean clone
!rm -rf sam3
!git clone https://github.com/facebookresearch/sam3.git

In [6]:
import fileinput
import os

sam3_root_dir = "/content/sam3" # Using absolute path, as SAM3_ROOT might not be defined yet

pyproject_path = os.path.join(sam3_root_dir, "pyproject.toml")
temp_file_path = pyproject_path + ".tmp"

# Check if pyproject.toml exists before trying to modify
if os.path.exists(pyproject_path):
    print(f"Attempting to modify {pyproject_path} to remove strict numpy version pinning.")
    modified_lines = []
    with open(pyproject_path, 'r') as infile:
        for line in infile:
            if 'numpy==' in line:
                # Replace the pinned version, allowing pip to use a compatible version
                modified_line = line.replace('numpy==1.26', 'numpy')
                modified_lines.append(modified_line)
            else:
                modified_lines.append(line)

    with open(temp_file_path, 'w') as outfile:
        outfile.writelines(modified_lines)

    os.replace(temp_file_path, pyproject_path)
    print(f"Modified {pyproject_path}.")
else:
    print(f"Warning: {pyproject_path} not found. Skipping pyproject.toml modification.")

# Original content of DPMUoaL8VrkE
!cd /content/sam3

Attempting to modify /content/sam3/pyproject.toml to remove strict numpy version pinning.
Modified /content/sam3/pyproject.toml.


In [7]:
%%capture
import os

# Ensure the current directory is sam3 for pip install
!cd sam3 && pip install -e .

In [8]:
%%capture
# For running example notebooks
!cd sam3 && pip install -e ".[notebooks]"

# For development
!cd sam3 && pip install -e ".[train,dev]"

In [9]:
%%capture
import torch
# turn on tfloat32 for Ampere GPUs
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# use bfloat16 for the entire notebook. If your card doesn't support it, try float16 instead
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()

# inference mode for the whole notebook. Disable if you need gradients
torch.inference_mode().__enter__()

In [10]:

%%capture
!hf login

In [11]:
import os

# SAM3_ROOT should point to the cloned sam3 repository
SAM3_ROOT = "/content/sam3"
os.chdir(SAM3_ROOT)

# setup GPU to use -  A single GPU is good with the purpose of this demo
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
_ = os.system("nvidia-smi")

## Build SAM3 Model

In [12]:
import sys
import os

# SAM3_ROOT is expected to be '/content/sam3' from previous steps
SAM3_ROOT = "/content/sam3"
if SAM3_ROOT not in sys.path:
    sys.path.insert(0, SAM3_ROOT)

import sam3
from sam3 import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor

sam3_root = os.path.dirname(sam3.__file__)
bpe_path = f"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz"
model = build_sam3_image_model(bpe_path=bpe_path)
processor = Sam3Processor(model, confidence_threshold=0.5)

config.json:   0%|          | 0.00/25.8k [00:00<?, ?B/s]

sam3.pt:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

## LLM Setup

Config which MLLM to use, it can either be a model served by vLLM that you launch from your own machine or a model is served via external API. If you want to using a vLLM model, we also provided insturctions below.

In [13]:
LLM_CONFIGS = {
    # models served via external APIs
    "gemini-3-pro-preview": {
        "provider": "gemini",
        "model": "gemini-3-pro-preview",
        "base_url": "", # Not needed for Gemini API
    },
}

model = "gemini-3-pro-preview"
# Note: GEMINI_API_KEY should be stored securely in Colab Secrets
LLM_API_KEY = "DUMMY_API_KEY" # This will be replaced by actual key from Colab secrets

llm_config = LLM_CONFIGS[model]
llm_config["api_key"] = LLM_API_KEY # This will be overwritten by actual key in setup_gemini_api_key cell
llm_config["name"] = model

# Setup API endpoint (not strictly needed for Gemini with direct SDK call, but keeping structure)
if llm_config["provider"] == "vllm":
    LLM_SERVER_URL = "http://0.0.0.0:8001/v1"  # replace this with your vLLM server address as needed
else:
    LLM_SERVER_URL = llm_config["base_url"] if "base_url" in llm_config and llm_config["base_url"] else ""


### Setup vLLM server
This step is only required if you are using a model served by vLLM, skip this step if you are calling LLM using an API like Gemini and GPT.

* Install vLLM (in a separate conda env from SAM 3 to avoid dependency conflicts).
  ```bash
    conda create -n vllm python=3.12
    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
  ```
* Start vLLM server on the same machine of this notebook
  ```bash
    # qwen 3 VL 8B thinking
    vllm serve Qwen/Qwen3-VL-8B-Thinking --tensor-parallel-size 4 --allowed-local-media-path / --enforce-eager --port 8001
  ```

## Run SAM3 Agent Inference

In [22]:
from functools import partial
from IPython.display import display, Image
from sam3.agent.client_llm import send_generate_request as send_generate_request_orig
from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig
from sam3.agent.inference import run_single_image_inference

# Ensure google-generativeai is up to date
!pip install --upgrade google-generativeai

# Install packages as per user's suggestion
!pip install -U -q "google"
!pip install -U -q "google.genai"

import google.generativeai as genai
from google.colab import userdata
import os # Import os for environment variable

try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
    # Set environment variable as suggested by user's snippet
    os.environ["GEMINI_API_KEY"] = GOOGLE_API_KEY
    print("Gemini API key configured from Colab secrets and environment.")
except userdata.SecretNotFoundError:
    print("WARNING: GOOGLE_API_KEY not found in Colab secrets. Please add it to secrets manager.")
    print("Using a DUMMY_API_KEY, requests to Gemini will likely fail.")
    GOOGLE_API_KEY = "DUMMY_API_KEY"
    os.environ["GEMINI_API_KEY"] = GOOGLE_API_KEY # Still set dummy for consistency

# Adjust send_generate_request to use Gemini SDK directly if provider is 'gemini'
def send_generate_request_gemini(messages, model_name, api_key, server_url):
    llm_model = genai.GenerativeModel(model_name)

    gemini_formatted_messages = []
    for msg in messages:
        role = 'model' if msg['role'] == 'assistant' else msg['role']
        gemini_formatted_messages.append(
            {'role': role, 'parts': [{'text': msg['content']}]}
        )

    # Explicitly define GenerationConfig with a temperature
    generation_config = genai.GenerationConfig(
        temperature=0.7,  # Default creative temperature
        max_output_tokens=2048 # A common maximum token limit
    )

    try:
        response = llm_model.generate_content(gemini_formatted_messages, generation_config=generation_config)
    except Exception as e:
        print(f"DEBUG: Error during generate_content call: {e}")
        # Return a string with the error message as expected by agent_core.py
        return f"ERROR: Gemini API call failed: {e}"

    generated_text = ""
    if hasattr(response, 'text'):
        generated_text = response.text
    elif hasattr(response, 'candidates') and response.candidates:
        # Get the first candidate's content
        candidate_content = response.candidates[0].content
        if hasattr(candidate_content, 'parts') and candidate_content.parts:
            # Concatenate text from all parts of the first candidate
            generated_text = "".join(part.text for part in candidate_content.parts if hasattr(part, 'text'))

        # Check for safety reasons if no text was generated or if a block occurred
        if not generated_text:
            if hasattr(response.candidates[0], 'finish_reason') and response.candidates[0].finish_reason == 'SAFETY':
                generated_text = "ERROR: Gemini response was blocked due to safety concerns."
            elif hasattr(response.candidates[0], 'finish_reason') and response.candidates[0].finish_reason == 'RECITATION':
                generated_text = "ERROR: Gemini response was blocked due to recitation."
            elif hasattr(response, 'prompt_feedback') and response.prompt_feedback and hasattr(response.prompt_feedback, 'block_reason'):
                generated_text = f"ERROR: Prompt was blocked: {response.prompt_feedback.block_reason}"


    if not generated_text:
        # Fallback if no text could be extracted or other unexpected response
        print(f"DEBUG: Gemini response did not contain accessible text. Full response object: {response}")
        generated_text = f"ERROR: Could not extract text from Gemini response. Raw response: {str(response)}"

    # The SAM3 agent expects a plain string, not a dictionary wrapper
    return generated_text

# Overwrite send_generate_request if using Gemini
if llm_config["provider"] == "gemini":
    send_generate_request = partial(send_generate_request_gemini, model_name=llm_config["model"], api_key=llm_config["api_key"], server_url=LLM_SERVER_URL)
else:
    send_generate_request = partial(send_generate_request_orig, server_url=LLM_SERVER_URL, model=llm_config["model"], api_key=llm_config["api_key"])


Gemini API key configured from Colab secrets and environment.


In [23]:
# prepare input args and run single image inference
image = "/content/sam3/assets/images/test_image.jpg"
prompt = "the leftmost child wearing blue vest"
# image = os.path.abspath(image) # This is no longer needed as we use the absolute path directly

call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)
output_image_path = run_single_image_inference(
    image, prompt, llm_config, send_generate_request, call_sam_service,
    debug=True, output_dir="agent_output"
)

# display output
if output_image_path is not None:
    display(Image(filename=output_image_path))

------------------------------ Starting SAM 3 Agent Session... ------------------------------ 
> Text prompt: the leftmost child wearing blue vest
> Image path: /content/sam3/assets/images/test_image.jpg



------------------------------ Round 1------------------------------



DEBUG: Error during generate_content call: 'ProtoType' object has no attribute 'DESCRIPTOR'

>>> MLLM Response [start]
ERROR: Gemini API call failed: 'ProtoType' object has no attribute 'DESCRIPTOR'
<<< MLLM Response [end]



ValueError: Invalid JSON in tool call: ERROR: Gemini API call failed: 'ProtoType' object has no attribute 'DESCRIPTOR'

### Fixing Protobuf and Google Generative AI Installation

The previous error `ProtoType' object has no attribute 'DESCRIPTOR'` often indicates an issue with the `protobuf` library or an incompatibility with `google-generativeai`. To resolve this, we will forcefully reinstall and upgrade these packages.

In [21]:
%%capture
# Force reinstall protobuf
!pip install --upgrade --force-reinstall protobuf

# Force reinstall and upgrade google-generativeai
!pip install --upgrade --force-reinstall google-generativeai

!pip install -U -q "google"
!pip install -U -q "google.genai"

import os
from google.colab import userdata
from google.colab import drive
os.environ["GEMINI_API_KEY"] = userdata.get("GOOGLE_API_KEY")

drive.mount("/content/drive")
# Please ensure that uploaded files are available in the AI Studio folder or change the working folder.
os.chdir("/content/drive/MyDrive/Google AI Studio")

### Fixing Protobuf and Google Generative AI Installation

The previous error `ProtoType' object has no attribute 'DESCRIPTOR'` often indicates an issue with the `protobuf` library or an incompatibility with `google-generativeai`. To resolve this, we will forcefully reinstall and upgrade these packages.

In [16]:
%%capture
# Force reinstall protobuf
!pip install --upgrade --force-reinstall protobuf

# Force reinstall and upgrade google-generativeai
!pip install --upgrade --force-reinstall google-generativeai

!pip install -U -q "google"
!pip install -U -q "google.genai"

import os
from google.colab import userdata
from google.colab import drive
os.environ["GEMINI_API_KEY"] = userdata.get("GOOGLE_API_KEY")

drive.mount("/content/drive")
# Please ensure that uploaded files are available in the AI Studio folder or change the working folder.
os.chdir("/content/drive/MyDrive/Google AI Studio")

In [None]:
# Check numpy version before torch installation
!pip show numpy

In [None]:
%%capture
!pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
# Check numpy version after torch installation
!pip show numpy

In [None]:
import fileinput
import os

sam3_root_dir = "/content/sam3" # Using absolute path

pyproject_path = os.path.join(sam3_root_dir, "pyproject.toml")
temp_file_path = pyproject_path + ".tmp"

# Check if pyproject.toml exists before trying to modify
if os.path.exists(pyproject_path):
    print(f"Attempting to modify {pyproject_path} to remove strict numpy version pinning.")
    modified_lines = []
    with open(pyproject_path, 'r') as infile:
        for line in infile:
            if 'numpy==' in line:
                # Replace the pinned version, allowing pip to use a compatible version
                modified_line = line.replace('numpy==1.26', 'numpy')
                modified_lines.append(modified_line)
            else:
                modified_lines.append(line)

    with open(temp_file_path, 'w') as outfile:
        outfile.writelines(modified_lines)

    os.replace(temp_file_path, pyproject_path)
    print(f"Modified {pyproject_path}.")
else:
    print(f"Warning: {pyproject_path} not found. Skipping pyproject.toml modification.")


In [None]:
# Check numpy version after pyproject.toml modification
!pip show numpy

In [None]:
%%capture
import os

# Use absolute path for pip install
!pip install -e /content/sam3

In [None]:
# Check numpy version after sam3 installation
!pip show numpy

In [None]:
%%capture
# For running example notebooks
!pip install -e "/content/sam3[notebooks]"

# For development
!pip install -e "/content/sam3[train,dev]"

In [None]:
# Check numpy version after sam3 extra installations
!pip show numpy

In [None]:
import torch
# turn on tfloat32 for Ampere GPUs
# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# use bfloat16 for the entire notebook. If your card doesn't support it, try float16 instead
torch.autocast("cuda", dtype=torch.bfloat16).__enter__()

# inference mode for the whole notebook. Disable if you need gradients
torch.inference_mode().__enter__()

In [18]:
from google.colab import drive
import os

drive.mount("/content/drive")
# Please ensure that uploaded files are available in the AI Studio folder or change the working folder.
os.chdir("/content/drive/MyDrive/Google AI Studio")

Mounted at /content/drive
